---
# **LAB 5 - Global memory (GMEM)**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ✅ Static and pinned memory

Static memory...

In [None]:
%%cuda_group_save --name "static_mem.cu" --group "lez5"
#include "/content/GPUcomputing/utils/common.h"
#include <stdio.h>
#define SIZE 16

// global memory
__device__ int  d_value[SIZE];
int             h_value[SIZE];

// kernel
// Each thread adds its own index to the matching element of the statically
// allocated device array d_value and prints the updated element.
// Expected launch: a single 1D block; threads with index >= SIZE do nothing.
__global__ void write_value() {
  // signed copy of the thread index, so that it matches the %d conversions below
  const int tid = threadIdx.x;

  // guard: a block larger than SIZE must not write past the end of d_value
  if (tid >= SIZE)
    return;

  d_value[tid] += tid;
  printf("value GPU[%d] = %d\n", tid, d_value[tid]);
}

// Fills h_value on the host, copies it into the device symbol d_value, runs
// write_value with one thread per element and copies the result back.
int main() {

  // load host data
  for (int i = 0; i < SIZE; i++)
    h_value[i] = i;

  // copy H2D using symbols
  CHECK(cudaMemcpyToSymbol(d_value, h_value, sizeof(h_value)));

  // kernel launch: one block, one thread per array element
  write_value<<<1, SIZE>>>();

  // a launch returns no status by itself: query it to catch a bad configuration
  CHECK(cudaGetLastError());

  // wait for the kernel: this reports execution errors and flushes the device
  // printf output before the host prints its own lines
  CHECK(cudaDeviceSynchronize());

  // copy D2H using symbols
  CHECK(cudaMemcpyFromSymbol(h_value, d_value, sizeof(h_value)));
  for (int i = 0; i < SIZE; i++)
    printf("value CPU [%d] = %d\n", i, h_value[i]);
  return 0;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez5/static_mem.cu -o static_mem
!./static_mem

↘️ **TODO...**

- Usare la pinned memory per il trasferimento dati usando cudaMallocHost

In [None]:
%%cuda_group_save --name "pin_mem.cu" --group "lez5"

#include "/content/GPUcomputing/utils/common.h"
#include <cuda_runtime.h>
#include <stdio.h>

// Times a host->device->host round trip of a float buffer allocated with
// malloc (pageable memory); the cudaMallocHost (pinned memory) variant is left
// as an exercise in the TODO section.
int main(int argc, char **argv) {

  // memory size: isize elements, nbytes bytes
  size_t isize = 1 << 25;
  size_t nbytes = isize * sizeof(float);
  // report the size in bytes (not the element count) next to the "byte" label
  printf("memory size = %zu byte (%5.2f MB)\n", nbytes, nbytes / (1024.0f * 1024.0f));

  // allocate the host memory
  float *h_a = (float *)malloc(nbytes);
  float *h_b = (float *)malloc(nbytes);
  if (h_a == NULL || h_b == NULL) {
    fprintf(stderr, "host allocation of %zu bytes failed\n", nbytes);
    free(h_a);
    free(h_b);
    return EXIT_FAILURE;
  }
  for (size_t i = 0; i < isize; i++)
    h_a[i] = 100.10f;

  /***********************************************************/
  /*              cudaMalloc & cudaMemcpy                    */
  /***********************************************************/

  printf("\ncudaMalloc & cudaMemcpy...\n");

  // the measured interval covers device allocation and both copies
  double start = seconds();

  // allocate device memory
  float *d_a;
  CHECK(cudaMalloc((float **)&d_a, nbytes));

  // transfer data from the host to the device
  CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice));

  // transfer data from the device to the host
  CHECK(cudaMemcpy(h_b, d_a, nbytes, cudaMemcpyDeviceToHost));

  CHECK(cudaDeviceSynchronize());
  printf("    Elapsed time: %f\n", seconds() - start);

  // free memory
  CHECK(cudaFree(d_a));

  // reset device
  CHECK(cudaDeviceReset());

  /***********************************************************/
  /*                  cudaMallocHost                         */
  /***********************************************************/

  printf("\ncudaMallocHost...\n");
  start = seconds();

  // TODO

  // release the pageable host buffers
  free(h_a);
  free(h_b);

  return EXIT_SUCCESS;
}


↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez5/pin_mem.cu -o pin_mem
!./pin_mem

# ✅ Unified memory


In [None]:
%%cuda_group_save --name "uni_mem.cu" --group "lez5"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Prints a readable message and terminates the program when a CUDA runtime
// call reports an error; `what` names the call for the message.
static void checkCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Kernel: prints the NUL-terminated string pointed to by str from device code.
__global__ void printme(char *str) {
  printf("%s", str);
}

int main() {
  // Allocate 100 bytes of memory, accessible to both Host and Device code
  const size_t len = 100;
  char *s = NULL;
  checkCuda(cudaMallocManaged(&s, len), "cudaMallocManaged");

  // Note direct Host-code use of "s"
  strncpy(s, "Hello Unified Memory GPU\n", len - 1);
  s[len - 1] = '\0';  // strncpy does not terminate when the source fills the buffer

  // Here we pass "s" to a kernel without explicitly copying
  printme<<< 1, 1 >>>(s);
  checkCuda(cudaGetLastError(), "printme launch");

  // Synchronize required (before, cudaMemcpy was synchronizing):
  // the host must not touch "s" while the kernel may still be using it
  checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  // change s: "GPU" becomes "CPU"
  s[21] = 'C';
  printf("%s", s);

  // Free as for normal CUDA allocations
  checkCuda(cudaFree(s), "cudaFree");
  return  0;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez5/uni_mem.cu -o uni_mem
!./uni_mem

↘️ **TODO...**

1. Definire la UMEM per ogni matrice
2. Effettuare la somma invocando il kernel
3. Analizzare i tempi di esecuzione


In [None]:
%%cuda_group_save --name "sum_mat_uni.cu" --group "lez5"

#include "/content/GPUcomputing/utils/common.h"

// Fills ip[0 .. size-1] with pseudo-random values: the low byte of each rand()
// result divided by 10.
void initialData(float *ip, const int size) {
  float *cursor = ip;
  int filled = 0;

  while (filled < size) {
    const int lowByte = rand() & 0xFF;
    *cursor = (float)lowByte / 10.0f;
    ++cursor;
    ++filled;
  }
}

// Host reference: element-wise sum C = A + B of two matrices stored row after
// row, with ny rows of nx elements each.
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny) {
  for (int row = 0; row < ny; ++row) {
    // offset of the first element of the current row
    const long long base = (long long)row * nx;

    for (int col = 0; col < nx; ++col) {
      const long long k = base + col;
      C[k] = A[k] + B[k];
    }
  }
}

// Compares the first N elements of hostRef and gpuRef. On the first pair whose
// absolute difference exceeds the tolerance it prints both values followed by
// a mismatch message and stops; nothing is printed when all elements agree.
void checkResult(float *hostRef, float *gpuRef, const int N) {
  const double tolerance = 1.0E-8;

  for (int k = 0; k < N; ++k) {
    const bool agrees = !(abs(hostRef[k] - gpuRef[k]) > tolerance);
    if (agrees)
      continue;

    printf("host %f gpu %f\n", hostRef[k], gpuRef[k]);
    printf("Arrays do not match.\n\n");
    return;
  }
}

// Element-wise matrix sum MatC = MatA + MatB, written for a 2D grid of 2D
// blocks: the x index selects the column, the y index selects the row, and
// rows are nx elements long.
__global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx, int ny) {
  const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
  const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

  // threads that fall outside the matrix have nothing to do
  if (col >= nx || row >= ny)
    return;

  const unsigned int offset = row * nx + col;
  MatC[offset] = MatA[offset] + MatB[offset];
}

// MAIN
int main(int argc, char **argv) {
  // Exercise scaffold: only the banner below is implemented so far.
  // Print the program name (no trailing newline is emitted here).
  printf("%s Starting ", argv[0]);

  // set up data size of matrix
  // NOTE(review): the run cell passes "14" on the command line - presumably a
  // power-of-two exponent for the matrix side; confirm before parsing argv[1]

  // allocate unified (managed) memory for the matrices

  // TODO

  // invoke kernel at host side

  // TODO

  // free device global memory


  return (0);
}


↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez5/sum_mat_uni.cu -o summat_uni
!./summat_uni 14

# ✅ Transpose

↘️ **TODO...**

passi per la trasposizione con SMEM:

1. definire la dim della SMEM pari alla dim del blocco
2. Il warp scrive i dati nella shared memory in row-major ordering evitando bank conflict sulle scritture. Ogni warp fa una lettura coalescente dei dati in global memory
3. sincronizzare i thread


In [None]:
%%cuda_group_save --name "transposeSMEM.cu" --group "lez5"

#include <stdio.h>
#include "/content/GPUcomputing/utils/common.h"
// Linear offset of element (rows, cols) in a matrix stored row after row, where
// each row is `stride` elements long. Every argument is parenthesized so that
// an expression argument such as INDEX(i + 1, j, n) expands with the intended
// precedence.
#define INDEX(rows, cols, stride) ((rows) * (stride) + (cols))
#define BDIMX 32
#define BDIMY 32

// prototipi funzioni
void initialData(float*, const int);
void printData(float*, int, int);
void checkResult(float*, float*, int, int);
void transposeHost(float*, float*, const int, const int);

/*
 * Kernel that computes the transposed matrix using shared memory.
 * NOTE(review): the body is still an exercise placeholder; as written the
 * kernel does nothing and leaves `out` untouched.
 */
__global__ void transposeSmem(float *out, float *in, int nrows, int ncols) {
	// static shared memory

	// TODO

}

// naive transpose, input read by rows: threads adjacent in x read adjacent
// elements of a row of `in` and write them nrows elements apart in `out`
__global__ void copyRow(float *out, float *in, const int nrows,	const int ncols) {
	// matrix coordinates handled by this thread
	const unsigned int c = threadIdx.x + blockDim.x * blockIdx.x;
	const unsigned int r = threadIdx.y + blockDim.y * blockIdx.y;

	// boundary test: threads outside the matrix do nothing
	if (r >= nrows || c >= ncols)
		return;

	// element (r, c) of in becomes element (c, r) of out
	out[INDEX(c, r, nrows)] = in[INDEX(r, c, ncols)];
}

// naive transpose, input read by columns: threads adjacent in x read elements
// of `in` that are nrows apart and write adjacent elements of `out`
// NOTE(review): `in` is indexed with row length nrows and `out` with row length
// ncols, the opposite of copyRow - the two agree for square matrices; confirm
// the intended layout for nrows != ncols
__global__ void copyCol(float *out, float *in, const int nrows,	const int ncols) {
	// matrix coordinates handled by this thread
	const unsigned int c = threadIdx.x + blockDim.x * blockIdx.x;
	const unsigned int r = threadIdx.y + blockDim.y * blockIdx.y;

	// boundary test: threads outside the matrix do nothing
	if (r >= nrows || c >= ncols)
		return;

	out[INDEX(r, c, ncols)] = in[INDEX(c, r, nrows)];
}

// MAIN
// Builds an nrows x ncols matrix on the host, computes its transpose on the
// host as reference, then runs and times the copyCol and copyRow kernels and
// compares each GPU result with the reference. The shared-memory variant is
// left as an exercise in the TODO section.
// Optional arguments: argv[1] = print flag, argv[2] = nrows, argv[3] = ncols.
int main(int argc, char **argv) {

	bool iprint = 0;

	// set up array size
	int nrows = 1 << 14;
	int ncols = 1 << 14;

	if (argc > 1)
		iprint = atoi(argv[1]);
	if (argc > 2)
		nrows = atoi(argv[2]);
	if (argc > 3)
		ncols = atoi(argv[3]);

	if (nrows <= 0 || ncols <= 0) {
		fprintf(stderr, "nrows and ncols must be positive\n");
		return EXIT_FAILURE;
	}

	printf("\nMatrice con nrows = %d ncols = %d\n", nrows, ncols);
	// multiply in size_t: the product of two ints can overflow int
	size_t ncells = (size_t)nrows * ncols;
	size_t nBytes = ncells * sizeof(float);

	// the host helpers and the kernels index the matrix with 32-bit integers
	if (ncells > (size_t)0x7FFFFFFF) {
		fprintf(stderr, "matrix too large: %zu elements\n", ncells);
		return EXIT_FAILURE;
	}

	// allocate host memory:
	// A_h = input, B_h = transpose computed on the host, AT_h = GPU result
	float *A_h = (float *) malloc(nBytes);
	float *B_h = (float *) malloc(nBytes);
	float *AT_h = (float *) malloc(nBytes);
	if (A_h == NULL || B_h == NULL || AT_h == NULL) {
		fprintf(stderr, "host allocation of %zu bytes failed\n", nBytes);
		free(A_h);
		free(B_h);
		free(AT_h);
		return EXIT_FAILURE;
	}

	// allocate device global memory (explicit copies are used below)
	float *d_A, *d_AT;
	CHECK(cudaMalloc((void** )&d_A, nBytes));
	CHECK(cudaMalloc((void** )&d_AT, nBytes));

	//  initialize host array
	initialData(A_h, nrows * ncols);
	if (iprint)
		printData(A_h, nrows, ncols);

	//  transpose at host side: output first, input second
	transposeHost(B_h, A_h, nrows, ncols);

	/***********************************************************/
	/*                KERNEL: col copy                         */
	/***********************************************************/

	printf("*** KERNEL: col copy  ***\n");
	// tranpose gmem
	CHECK(cudaMemcpy(d_A, A_h, nBytes, cudaMemcpyHostToDevice));
	dim3 block(BDIMX, BDIMY, 1);
	dim3 grid((ncols + block.x - 1) / block.x, (nrows + block.y - 1) / block.y, 1);

	double iStart = seconds();
	copyCol<<<grid, block>>>(d_AT, d_A, nrows, ncols);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	double iElaps = seconds() - iStart;

	// check result: host reference against the GPU output
	// (the transposed matrix has ncols rows of nrows elements)
	CHECK(cudaMemcpy(AT_h, d_AT, nBytes, cudaMemcpyDeviceToHost));
	checkResult(B_h, AT_h, ncols, nrows);

	double ibnd = 2 * ncells * sizeof(float) / 1e9 / iElaps;
	printf("col copy elapsed %f sec\n <<< grid (%d,%d) block (%d,%d)>>> "
			"effective bandwidth %f GB/s\n\n", iElaps, grid.x, grid.y, block.x,	block.y, ibnd);

	/***********************************************************/
	/*                KERNEL: row copy                         */
	/***********************************************************/

	printf("*** KERNEL: row copy  ***\n");
	// tranpose gmem

	// clear the previous output so that a stale result cannot pass the check
	CHECK(cudaMemset(d_AT, 0, nBytes));

	iStart = seconds();
	copyRow<<<grid, block>>>(d_AT, d_A, nrows, ncols);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	iElaps = seconds() - iStart;

	// check result: host reference against the GPU output
	CHECK(cudaMemcpy(AT_h, d_AT, nBytes, cudaMemcpyDeviceToHost));
	checkResult(B_h, AT_h, ncols, nrows);

	ibnd = 2 * ncells * sizeof(float) / 1e9 / iElaps;
	printf("row copy elapsed %f sec\n <<< grid (%d,%d) block (%d,%d)>>> "
			"effective bandwidth %f GB/s\n\n", iElaps, grid.x, grid.y, block.x,	block.y, ibnd);

	/***********************************************************/
	/*                KERNEL: SMEM copy                        */
	/***********************************************************/

	printf("*** KERNEL: transposeSmem ***\n");
	// tranpose smem


	// TODO

	// free host and device memory
	CHECK(cudaFree(d_A));
	CHECK(cudaFree(d_AT));
	free(A_h);
	free(B_h);
	free(AT_h);

	return EXIT_SUCCESS;
}

// Fills in[0 .. size-1] so that every element holds its own index as a float.
void initialData(float *in, const int size) {
	int k = 0;
	while (k < size) {
		in[k] = k;
		++k;
	}
}

// Prints the matrix one row per output line; each element is written with the
// "%3.0f " format.
void printData(float *in, int nrows, int ncols) {
	for (int r = 0; r < nrows; ++r) {
		int c = 0;
		while (c < ncols) {
			const float value = in[INDEX(r, c, ncols)];
			printf("%3.0f ", value);
			++c;
		}
		printf("\n");
	}
}

// Host reference transpose: element (r, c) of `in` (rows of ncols elements) is
// stored as element (c, r) of `out` (rows of nrows elements).
void transposeHost(float *out, float *in, const int nrows, const int ncols) {
	// the input is visited in storage order, so a running offset is enough
	int src = 0;

	for (int r = 0; r < nrows; ++r) {
		for (int c = 0; c < ncols; ++c) {
			out[INDEX(c, r, nrows)] = in[src];
			++src;
		}
	}
}

// Compares hostRef and gpuRef element by element over a rows x cols matrix.
// On the first pair whose absolute difference exceeds the tolerance it prints
// the position and both values, then a mismatch message, and stops; nothing is
// printed when all elements agree.
void checkResult(float *hostRef, float *gpuRef, int rows, int cols) {
	const double tolerance = 1.0E-8;

	for (int r = 0; r < rows; ++r) {
		for (int c = 0; c < cols; ++c) {
			const int offset = INDEX(r, c, cols);
			if (!(abs(hostRef[offset] - gpuRef[offset]) > tolerance))
				continue;

			printf("different on (%d, %d) (offset=%d) element in "
					"transposed matrix: host %f gpu %f\n", r, c, offset,
					hostRef[offset], gpuRef[offset]);
			printf("Arrays do not match.\n");
			return;
		}
	}
}


↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez5/transposeSMEM.cu -o transposeSMEM
!./transposeSMEM