---
# **LAB 8 - CUDA Streams**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ✅ Somma array con stream

In [None]:
%%cuda_group_save --name "sumArrayStream.cu" --group "lez8"
#include "/content/GPUcomputing/utils/common.h"


#define NSTREAM 8
#define BDIM 1024

// function prototypes
void initialData(float *ip, int size);
void sumArraysOnHost(float *A, float *B, float *C, const int N);
void checkResult(float *hostRef, float *gpuRef, const int N);

/**
 * Element-wise vector addition on the device: C[i] = A[i] + B[i] for i < N.
 * Each thread handles the single element addressed by its global index;
 * threads whose index is N or larger do nothing.
 */
__global__ void sumArrays(float *A, float *B, float *C, const int N) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;

  // surplus threads of the last block have no element to process
  if (gid >= N)
    return;

  C[gid] = A[gid] + B[gid];
}

/**
 * Main program.
 *
 * Adds two vectors of 2^26 floats three times and reports the elapsed time:
 *   1. on the host (reference result in hostRef),
 *   2. on the GPU using blocking copies in the default stream,
 *   3. on the GPU splitting the work over NSTREAM streams with async copies.
 * Each GPU result is validated against hostRef; gpuRef and d_C are cleared
 * between the two GPU runs so that the second check cannot pass on data left
 * over from the first run.
 */
int main(int argc, char **argv) {
  printf("Starting...\n");

  // set up data size of vectors
  int nElem = 1 << 26;
  printf("   vector size = %d\n", nElem);
  size_t nBytes = nElem * sizeof(float);
  printf ("   with streams = %d\n", NSTREAM);

  // malloc pinned host memory for async memcpy
  float *h_A, *h_B, *hostRef, *gpuRef;
  CHECK(cudaHostAlloc((void**)&h_A, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&h_B, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&gpuRef, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&hostRef, nBytes, cudaHostAllocDefault));

  // initialize data at host side
  initialData(h_A, nElem);
  initialData(h_B, nElem);
  memset(hostRef, 0, nBytes);
  memset(gpuRef,  0, nBytes);

  /***************************************************
  *                  Host side                       *
  ****************************************************/
  printf("\nHost compute...\n");
  double start = seconds();
  sumArraysOnHost(h_A, h_B, hostRef, nElem);
  double cpu_time = seconds() - start;
  printf("   CPU elapsed time: %.5f (sec)\n", cpu_time);

  // malloc device global memory
  float *d_A, *d_B, *d_C;
  CHECK(cudaMalloc(&d_A, nBytes));
  CHECK(cudaMalloc(&d_B, nBytes));
  CHECK(cudaMalloc(&d_C, nBytes));

  /***************************************************
  *                Default stream                    *
  ****************************************************/
  printf("\nDefault stream...\n");
  dim3 block (BDIM);
  dim3 grid  ((nElem + block.x - 1) / block.x);

  // mem copy data from host to device & run kernel
  start = seconds();
  CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
  sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
  CHECK(cudaGetLastError());   // a bad launch configuration is only reported here
  CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
  CHECK(cudaDeviceSynchronize());
  double gpu_time = seconds() - start;
  printf("   GPU elapsed time (default stream): %.5f (sec)- speedup %.1f\n", gpu_time, cpu_time/gpu_time);

  // check the default-stream result, then wipe both result buffers so the
  // multi-stream run is validated on its own output
  checkResult(hostRef, gpuRef, nElem);
  memset(gpuRef, 0, nBytes);
  CHECK(cudaMemset(d_C, 0, nBytes));

  /***************************************************
  *                 multi-stream                     *
  ****************************************************/
  printf("\nMulti stream...\n");
  // NOTE: the split assumes nElem is a multiple of NSTREAM (true for 2^26 / 8)
  int iElem = nElem / NSTREAM;
  size_t iBytes = iElem * sizeof(float);
  grid.x = (iElem + block.x - 1) / block.x;

  // create streams (creation is part of the measured time, as before)
  start = seconds();
  cudaStream_t stream[NSTREAM];
  for (int i = 0; i < NSTREAM; ++i)
    CHECK(cudaStreamCreate(&stream[i]));

  // per stream: async H2D copies, kernel on the chunk, async D2H copy
  for (int i = 0; i < NSTREAM; ++i) {
    int ioffset = i * iElem;

    // initiate asynchronous transfers to the device
    CHECK(cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]));
    CHECK(cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]));

    // launch a kernel in each stream
    sumArrays<<<grid, block, 0, stream[i]>>>(&d_A[ioffset], &d_B[ioffset], &d_C[ioffset], iElem);
    CHECK(cudaGetLastError());

    // enqueue asynchronous transfers from the device
    CHECK(cudaMemcpyAsync(&gpuRef[ioffset], &d_C[ioffset], iBytes, cudaMemcpyDeviceToHost, stream[i]));
  }
  CHECK(cudaDeviceSynchronize());
  gpu_time = seconds() - start;
  printf("   GPU elapsed time (multi-stream): %.5f (sec)- speedup %.1f\n", gpu_time, cpu_time/gpu_time);

  // check device results
  checkResult(hostRef, gpuRef, nElem);

  // free device global memory
  CHECK(cudaFree(d_A));
  CHECK(cudaFree(d_B));
  CHECK(cudaFree(d_C));

  // free host memory
  CHECK(cudaFreeHost(h_A));
  CHECK(cudaFreeHost(h_B));
  CHECK(cudaFreeHost(hostRef));
  CHECK(cudaFreeHost(gpuRef));

  // destroy streams
  for (int i = 0; i < NSTREAM; ++i)
    CHECK(cudaStreamDestroy(stream[i]));

  CHECK(cudaDeviceReset());
  return(0);
}

/**
 * Fills ip[0 .. size-1] with pseudo-random values: the low 8 bits of rand()
 * divided by 10, i.e. numbers in [0.0, 25.5]. Does nothing when size <= 0.
 */
void initialData(float *ip, int size) {
   float *cursor = ip;

   for (int remaining = size; remaining > 0; --remaining) {
     const int lowByte = rand() & 0xFF;
     *cursor++ = (float)lowByte / 10.0f;
   }
}

 /**
  * Host reference implementation of the vector sum: C[i] = A[i] + B[i]
  * for i in [0, N). Does nothing when N <= 0.
  */
 void sumArraysOnHost(float *A, float *B, float *C, const int N) {
   const float *lhs = A;
   const float *rhs = B;
   float *out = C;

   for (int left = N; left > 0; --left)
     *out++ = *lhs++ + *rhs++;
 }

/**
 * Compares the first N entries of hostRef and gpuRef.
 * Prints "Arrays match." when every pair differs by at most epsilon;
 * otherwise prints the first offending pair with its index and stops.
 * A NaN difference is reported as a mismatch.
 */
void checkResult(float *hostRef, float *gpuRef, const int N) {
   const double epsilon = 1.0E-8;

   for (int i = 0; i < N; i++) {
     // fabsf keeps the comparison in floating point whatever abs() overloads
     // happen to be visible
     const float diff = fabsf(hostRef[i] - gpuRef[i]);

     // written as !(diff <= eps) so that a NaN difference also counts as a mismatch
     if (!(diff <= epsilon)) {
       printf("Arrays do not match!\n");
       printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
       return;
     }
   }

   printf("Arrays match.\n\n");
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75  src/lez8/sumArrayStream.cu  -o sumArray
!./sumArray

# ✅ Tabular


↘️ **TODO...**

1. Come modificare il kernel per usare gli stream
2. Gestione della memoria pinned e device

Applicare
3. Schema: loop over {copy, kernel, copy}
4. Schema: loop over {copy H2D}, loop over {kernel}, loop over {copy D2H}


In [None]:
%%cuda_group_save --name "tabular.cu" --group "lez8"

#include <stdio.h>
#include "/content/GPUcomputing/utils/common.h"

#define PI 3.141592f

/*
 * Kernel: tabular function.
 * Thread i (for i < n) stores a[i] = sqrt(|sin(x)^2 - cos(x)^2|)
 * with x = PI * i / n; threads with i >= n do nothing.
 */
__global__ void tabular(float *a, int n) {
	const int gid = threadIdx.x + blockIdx.x * blockDim.x;
	if (gid >= n)
		return;

	const float angle = PI * (float)gid / (float)n;
	const float sinX = sinf(angle);
	const float cosX = cosf(angle);
	a[gid] = sqrtf(abs(sinX * sinX - cosX * cosX));
}

/*
 * Kernel: tabular function using streams
 */

	// TODO

/*
 * Error measure: largest absolute deviation from 1.0 among a[0 .. n-1].
 * Returns 0 when n <= 0.
 */
float maxError(float *a, int n) {
	float worst = 0;
	int i = 0;
	while (i < n) {
		const float deviation = fabs(a[i] - 1.0f);
		worst = (deviation > worst) ? deviation : worst;
		++i;
	}
	return worst;
}

/*
 * Main: tabular function.
 *
 * Allocates a pinned host array and a device array of n floats, creates the
 * events and streams used by the exercise, and times the baseline version
 * (blocking H2D copy, kernel, blocking D2H copy) with CUDA events.
 * The two asynchronous versions are left as exercises (see the TODO markers).
 */
int main(void) {

	// main params
	const uint MB = 1024*1024;
	const uint n = 256*MB;
	const int blockSize = 256;
	const int nStreams = 8;   // constant, so stream[] below is an ordinary array

	const int streamSize = n / nStreams;
	// byte counts are kept in size_t: n * sizeof(float) is 1 GiB, which leaves
	// no headroom in a 32-bit int if n is ever increased
	const size_t streamBytes = (size_t)streamSize * sizeof(float);
	const size_t bytes = (size_t)n * sizeof(float);

	int devId = 0;
	cudaDeviceProp prop;
	CHECK(cudaGetDeviceProperties(&prop, devId));
	printf("Device : %s\n\n", prop.name);
	CHECK(cudaSetDevice(devId));
	printf("Array size   : %u\n", n);
	printf("StreamSize   : %d\n", streamSize);
	printf("Memory bytes : %zu (MB)\n", bytes/MB);
	printf("streamBytes  : %zu (MB)\n", streamBytes/MB);

	// allocate pinned host memory and device memory
	float *a, *d_a;
	CHECK(cudaMallocHost((void**) &a, bytes));      // host pinned
	CHECK(cudaMalloc((void**) &d_a, bytes));        // device

	float ms; // elapsed time in milliseconds

	// create events and streams
	cudaEvent_t startEvent, stopEvent, dummyEvent;
	cudaStream_t stream[nStreams];
	CHECK(cudaEventCreate(&startEvent));
	CHECK(cudaEventCreate(&stopEvent));
	CHECK(cudaEventCreate(&dummyEvent));
	for (int i = 0; i < nStreams; ++i)
		CHECK(cudaStreamCreate(&stream[i]));

	// baseline case - sequential transfer and execute
	memset(a, 0, bytes);
	CHECK(cudaEventRecord(startEvent, 0));
	CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
	// ceil-div so the grid still covers the array if n stops being a multiple of blockSize
	tabular<<<(n + blockSize - 1) / blockSize, blockSize>>>(d_a, n);
	CHECK(cudaGetLastError());   // launch errors are not reported by the launch itself
	CHECK(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
	CHECK(cudaEventRecord(stopEvent, 0));
	CHECK(cudaEventSynchronize(stopEvent));
	CHECK(cudaEventElapsedTime(&ms, startEvent, stopEvent));
	printf("\nTime for sequential transfer and execute (ms): %f\n", ms);
	printf("  max error: %e\n", maxError(a, n));

	// asynchronous version 1: loop over {copy, kernel, copy}
	// TODO


	// asynchronous version 2: loop over copy, loop over kernel, loop over copy
	// TODO

	// cleanup
	CHECK(cudaEventDestroy(startEvent));
	CHECK(cudaEventDestroy(stopEvent));
	CHECK(cudaEventDestroy(dummyEvent));
	for (int i = 0; i < nStreams; ++i)
		CHECK(cudaStreamDestroy(stream[i]));
	CHECK(cudaFree(d_a));
	CHECK(cudaFreeHost(a));

	return 0;
}


↩ **Run...**

In [None]:

!nvcc -arch=sm_75 src/lez8/tabular.cu  -o tabular
!./tabular

# ✅ Convoluzione con stream


PPM filtering by 2D convolution...

In [None]:
%%cuda_group_save --name "ppm_conv2D.cu" --group "lez8"
#include "/content/GPUcomputing/utils/common.h"
#include "ppm.h"

#define BLOCK_SIZE   32
#define MASK_SIZE    21
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)

// Dense 2D float matrix descriptor; conv2D receives it by value.
typedef struct {
   int width;        // number of columns
   int height;       // number of rows
   float* elements;  // matrix values; conv2D addresses them as elements[row * width + col]
 } Matrix;

 /*
  * 2D convolution using shared memory
  *   A: input matrix
  *   B: output matrix
  *   M: convolution mask matrix (read as MASK_SIZE x MASK_SIZE values)
  *
  * Every block first stages a TILE_SIZE x TILE_SIZE window of A in shared
  * memory, starting MASK_SIZE/2 rows/columns before the block's first pixel;
  * positions that fall outside the image are stored as 0. After a block-wide
  * barrier each thread accumulates the mask-weighted sum for its own pixel
  * and writes it to B.
  */
__global__ void conv2D(Matrix A, Matrix B, Matrix M) {

   const int px = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
   const int py = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread

   const int side = BLOCK_SIZE + MASK_SIZE - 1;  // edge length of the staged window
   const int halo = MASK_SIZE / 2;
   const int stepX = blockDim.x;
   const int stepY = blockDim.y;

   // staged input window
   __shared__ float tile[TILE_SIZE][TILE_SIZE];

   // cooperative load: each thread walks the window with a stride of one block
   for (int sy = threadIdx.y, srcY = py - halo; sy < side; sy += stepY, srcY += stepY) {
      const bool rowInside = (srcY >= 0) && (srcY < A.height);
      for (int sx = threadIdx.x, srcX = px - halo; sx < side; sx += stepX, srcX += stepX) {
         const bool colInside = (srcX >= 0) && (srcX < A.width);
         // zero padding outside the image
         tile[sy][sx] = (rowInside && colInside) ? A.elements[srcY * A.width + srcX] : 0.0f;
      }
   }

   // the whole window must be in place before anyone reads it
   __syncthreads();

   // mask-weighted sum over the window anchored at this thread
   float acc = 0.0f;
   for (int my = 0; my < MASK_SIZE; my++) {
      const int ty = threadIdx.y + my;
      if (ty < 0 || ty >= side)
         continue;
      for (int mx = 0; mx < MASK_SIZE; mx++) {
         const int tx = threadIdx.x + mx;
         if (tx < 0 || tx >= side)
            continue;
         acc += tile[ty][tx] * M.elements[my * MASK_SIZE + mx];
      }
   }

   // only threads that map to a real pixel store a result
   if (py < A.height && px < A.width) {
      B.elements[py * B.width + px] = acc;
   }
}

/*
 * Main function.
 *
 * Loads a PPM image, blurs it with a Gaussian mask on the host
 * (ppm_gaussFilter) and on the GPU (conv2D, one launch per colour channel),
 * reports both timings and writes each result to its own file.
 */
int main(void) {
   // Load image
   char path[] = "GPUcomputing/images/dog.ppm";
   PPM *img = ppm_load(path);
   if (img == NULL) {
      fprintf(stderr, "Cannot load image %s\n", path);
      return 1;
   }
   int WIDTH = img->width;
   int HEIGHT = img->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   const size_t imgBytes = (size_t) WIDTH * HEIGHT * sizeof(float);

   // extract channels and set matrices
   Matrix R, G, B;
   R.width = WIDTH; R.height = HEIGHT;
   G.width = WIDTH; G.height = HEIGHT;
   B.width = WIDTH; B.height = HEIGHT;
   R.elements = (float *) malloc(imgBytes);
   G.elements = (float *) malloc(imgBytes);
   B.elements = (float *) malloc(imgBytes);
   if (R.elements == NULL || G.elements == NULL || B.elements == NULL) {
      fprintf(stderr, "Host allocation of the channel buffers failed\n");
      return 1;
   }
   color *r = ppm_extract_channel(img, 0); // get red channel
   color *g = ppm_extract_channel(img, 1); // get green channel
   color *b = ppm_extract_channel(img, 2); // get blue channel
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      R.elements[i] = (float) r[i];
      G.elements[i] = (float) g[i];
      B.elements[i] = (float) b[i];
   }

   // get gaussian filter mask
   // The mask is MASK_SIZE x MASK_SIZE (that is how conv2D indexes it), NOT
   // image-sized: sizing it as WIDTH x HEIGHT made the H2D copy read far
   // beyond the buffer returned by gaussMask.
   // NOTE(review): assumes gaussMask returns MASK_SIZE * MASK_SIZE floats - confirm in ppm.h
   float SIGMA = 10.0;
   Matrix M;
   M.width = MASK_SIZE; M.height = MASK_SIZE;
   M.elements = gaussMask(MASK_SIZE, SIGMA);
   const size_t maskBytes = (size_t) M.width * M.height * sizeof(float);

   // Allocate device memory: one input and one output buffer per channel.
   // conv2D reads the neighbourhood of every pixel, so writing the result
   // into the input buffer would let blocks read values already overwritten
   // by other blocks.
   Matrix d_R, d_G, d_B, d_M;
   Matrix d_Rout, d_Gout, d_Bout;
   d_R.width = R.width; d_R.height = R.height;
   d_G.width = G.width; d_G.height = G.height;
   d_B.width = B.width; d_B.height = B.height;
   d_M.width = M.width; d_M.height = M.height;
   d_Rout.width = R.width; d_Rout.height = R.height;
   d_Gout.width = G.width; d_Gout.height = G.height;
   d_Bout.width = B.width; d_Bout.height = B.height;
   CHECK(cudaMalloc(&d_R.elements, imgBytes));
   CHECK(cudaMalloc(&d_G.elements, imgBytes));
   CHECK(cudaMalloc(&d_B.elements, imgBytes));
   CHECK(cudaMalloc(&d_M.elements, maskBytes));
   CHECK(cudaMalloc(&d_Rout.elements, imgBytes));
   CHECK(cudaMalloc(&d_Gout.elements, imgBytes));
   CHECK(cudaMalloc(&d_Bout.elements, imgBytes));

   /***********************************************************/
   /*                    conv2D on host                       */
   /***********************************************************/
   printf("\nCPU procedure...\n");
   double start = seconds();
   PPM *img_filtered = ppm_make(WIDTH, HEIGHT, (pel) {0,0,0}); // create a new image
   ppm_gaussFilter(img, img_filtered, MASK_SIZE, SIGMA);
   ppm_write(img_filtered, "output_gaussian.ppm");
   double stopCPU = seconds() - start;
   printf("   Host elapsed time: %f\n", stopCPU);

   /***********************************************************/
   /*                  GPU conv2D with smem                   */
   /***********************************************************/
   printf("\nGPU conv2D with smem...\n");
   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
   dim3 dimGrid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE, (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);

   start = seconds();

   // Copy data to device
   CHECK(cudaMemcpy(d_R.elements, R.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_G.elements, G.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_B.elements, B.elements, imgBytes, cudaMemcpyHostToDevice));
   CHECK(cudaMemcpy(d_M.elements, M.elements, maskBytes, cudaMemcpyHostToDevice));

   // invoke kernels (input and output buffers are distinct)
   conv2D<<<dimGrid, dimBlock>>>(d_R, d_Rout, d_M);
   conv2D<<<dimGrid, dimBlock>>>(d_G, d_Gout, d_M);
   conv2D<<<dimGrid, dimBlock>>>(d_B, d_Bout, d_M);
   CHECK(cudaGetLastError());

   // Copy data back to host (blocking copies also wait for the kernels)
   CHECK(cudaMemcpy(R.elements, d_Rout.elements, imgBytes, cudaMemcpyDeviceToHost));
   CHECK(cudaMemcpy(G.elements, d_Gout.elements, imgBytes, cudaMemcpyDeviceToHost));
   CHECK(cudaMemcpy(B.elements, d_Bout.elements, imgBytes, cudaMemcpyDeviceToHost));

   // print elapsed time: H2D + kernel + D2H
   double stopGPU = seconds() - start;
   printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, stopCPU / stopGPU);

   // Copy channels to new image
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      r[i] = (color) R.elements[i];
      g[i] = (color) G.elements[i];
      b[i] = (color) B.elements[i];
   }

   // build new image and save the GPU result (not the CPU-filtered image)
   PPM *ppm_filtered = ppm_combine_channels(r, g, b, WIDTH, HEIGHT);
   ppm_write(ppm_filtered, "output_gaussianGPU.ppm");

   // release the buffers allocated in this function
   CHECK(cudaFree(d_R.elements));
   CHECK(cudaFree(d_G.elements));
   CHECK(cudaFree(d_B.elements));
   CHECK(cudaFree(d_M.elements));
   CHECK(cudaFree(d_Rout.elements));
   CHECK(cudaFree(d_Gout.elements));
   CHECK(cudaFree(d_Bout.elements));
   free(R.elements);
   free(G.elements);
   free(B.elements);

   return 0;
}




↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez8/ppm_conv2D.cu  -o ppm_conv2D -I /content/GPUcomputing/utils/PPM /content/GPUcomputing/utils/PPM/ppm.cpp
!./ppm_conv2D

PPM filtering by 2D convolution with streams...

↘️ **TODO...**

Passi da fare:
 1. CUDA stream creation
 2. Async copy of data to device
 3. invoke kernels on streams
 4. Copy data back to host
 5. Wait for all streams to finish
 6. Free device memory and destroy streams


In [None]:
%%cuda_group_save --name "ppm_conv2D_stream.cu" --group "lez8"
#include "/content/GPUcomputing/utils/common.h"
#include "ppm.h"

#define BLOCK_SIZE   32
#define MASK_SIZE    21
#define TILE_SIZE    (BLOCK_SIZE + MASK_SIZE - 1)

// Dense 2D float matrix descriptor; conv2D receives it by value.
typedef struct {
   int width;        // number of columns
   int height;       // number of rows
   float* elements;  // matrix values; conv2D addresses them as elements[row * width + col]
 } Matrix;

 /*
  * 2D convolution using shared memory
  *   A: input matrix
  *   B: output matrix
  *   M: convolution mask matrix (read as MASK_SIZE x MASK_SIZE values)
  *
  * Every block first stages a TILE_SIZE x TILE_SIZE window of A in shared
  * memory, starting MASK_SIZE/2 rows/columns before the block's first pixel;
  * positions that fall outside the image are stored as 0. After a block-wide
  * barrier each thread accumulates the mask-weighted sum for its own pixel
  * and writes it to B.
  */
__global__ void conv2D(Matrix A, Matrix B, Matrix M) {

   const int px = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
   const int py = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread

   const int side = BLOCK_SIZE + MASK_SIZE - 1;  // edge length of the staged window
   const int halo = MASK_SIZE / 2;
   const int stepX = blockDim.x;
   const int stepY = blockDim.y;

   // staged input window
   __shared__ float tile[TILE_SIZE][TILE_SIZE];

   // cooperative load: each thread walks the window with a stride of one block
   for (int sy = threadIdx.y, srcY = py - halo; sy < side; sy += stepY, srcY += stepY) {
      const bool rowInside = (srcY >= 0) && (srcY < A.height);
      for (int sx = threadIdx.x, srcX = px - halo; sx < side; sx += stepX, srcX += stepX) {
         const bool colInside = (srcX >= 0) && (srcX < A.width);
         // zero padding outside the image
         tile[sy][sx] = (rowInside && colInside) ? A.elements[srcY * A.width + srcX] : 0.0f;
      }
   }

   // the whole window must be in place before anyone reads it
   __syncthreads();

   // mask-weighted sum over the window anchored at this thread
   float acc = 0.0f;
   for (int my = 0; my < MASK_SIZE; my++) {
      const int ty = threadIdx.y + my;
      if (ty < 0 || ty >= side)
         continue;
      for (int mx = 0; mx < MASK_SIZE; mx++) {
         const int tx = threadIdx.x + mx;
         if (tx < 0 || tx >= side)
            continue;
         acc += tile[ty][tx] * M.elements[my * MASK_SIZE + mx];
      }
   }

   // only threads that map to a real pixel store a result
   if (py < A.height && px < A.width) {
      B.elements[py * B.width + px] = acc;
   }
}

/*
 * Main function (stream exercise scaffold).
 *
 * Loads a PPM image, blurs it on the host with ppm_gaussFilter and prepares
 * the device buffers and launch configuration for the GPU version, whose
 * stream-based steps are left to be filled in at the TODO markers.
 */
int main(void) {
   // Load image
   char path[] = "GPUcomputing/images/dog.ppm";
   PPM *img = ppm_load(path);
   int WIDTH = img->width;
   int HEIGHT = img->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   // extract channels and set matrices
   Matrix R, G, B;
   R.width = WIDTH; R.height = HEIGHT;
   G.width = WIDTH; G.height = HEIGHT;
   B.width = WIDTH; B.height = HEIGHT;
   R.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   G.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   B.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   color *r = ppm_extract_channel(img, 0); // get red channel
   color *g = ppm_extract_channel(img, 1); // get green channel
   color *b = ppm_extract_channel(img, 2); // get blue channel
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      R.elements[i] = (float) r[i];
      G.elements[i] = (float) g[i];
      B.elements[i] = (float) b[i];
   }

   // get gaussian filter mask
   // The mask is MASK_SIZE x MASK_SIZE (that is how conv2D indexes it), NOT
   // image-sized: sizing it as WIDTH x HEIGHT made the H2D copy below read
   // far beyond the buffer returned by gaussMask.
   // NOTE(review): assumes gaussMask returns MASK_SIZE * MASK_SIZE floats - confirm in ppm.h
   float SIGMA = 10.0;
   Matrix M;
   M.width = MASK_SIZE; M.height = MASK_SIZE;
   M.elements = gaussMask(MASK_SIZE, SIGMA);

   // Allocate device memory
   Matrix d_R, d_G, d_B, d_M;
   d_R.width = R.width; d_R.height = R.height;
   d_G.width = G.width; d_G.height = G.height;
   d_B.width = B.width; d_B.height = B.height;
   d_M.width = M.width; d_M.height = M.height;
   CHECK(cudaMalloc(&d_R.elements, R.width * R.height * sizeof(float)));
   CHECK(cudaMalloc(&d_G.elements, G.width * G.height * sizeof(float)));
   CHECK(cudaMalloc(&d_B.elements, B.width * B.height * sizeof(float)));
   CHECK(cudaMalloc(&d_M.elements, M.width * M.height * sizeof(float)));

   // Copy mask to device
   CHECK(cudaMemcpy(d_M.elements, M.elements, M.width * M.height * sizeof(float), cudaMemcpyHostToDevice));

   /***********************************************************/
   /*                    conv2D on host                       */
   /***********************************************************/
   printf("\nCPU procedure...\n");
   double start = seconds();
   PPM *img_filtered = ppm_make(WIDTH, HEIGHT, (pel) {0,0,0}); // create a new image
   ppm_gaussFilter(img, img_filtered, MASK_SIZE, SIGMA);
   ppm_write(img_filtered, "output_gaussian.ppm");
   double stopCPU = seconds() - start;
   printf("   Host elapsed time: %f\n", stopCPU);

   /***********************************************************/
   /*                GPU conv2D with streams                  */
   /***********************************************************/
   printf("\nGPU conv2D with smem...\n");
   const int C = 3; // RGB channels
   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
   dim3 dimGrid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE, (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);

   start = seconds();

   // CUDA stream creation

   // Async copy of data to device

   // invoke kernels on streams
   // NOTE: conv2D reads the neighbourhood of each pixel, so give every channel
   // an output buffer distinct from its input buffer (no in-place launch)

   // Copy data back to host

   // Wait for all streams to finish

   // Free device memory and destroy streams

   // print elapsed time: H2D + kernel + D2H
   double stopGPU = seconds() - start;
   printf("    Elapsed time: %f (sec) - speedup %.1f\n", stopGPU, stopCPU / stopGPU);

   // Copy channels to new image
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      r[i] = (color) R.elements[i];
      g[i] = (color) G.elements[i];
      b[i] = (color) B.elements[i];
   }

   // build new image and save the GPU result (not the CPU-filtered image)
   PPM *ppm_filtered = ppm_combine_channels(r, g, b, WIDTH, HEIGHT);
   ppm_write(ppm_filtered, "output_gaussianGPU.ppm");

   return 0;
}




↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez8/ppm_conv2D_stream.cu  -o ppm_conv2D_stream -I /content/GPUcomputing/utils/PPM /content/GPUcomputing/utils/PPM/ppm.cpp
!./ppm_conv2D_stream

# ✅ Kernel per immagini PPM con stream ed eventi


↘️ **TODO...**

Passi da fare:

 1. create an event on channel G for synchronization
 2. CUDA stream creation (one for each channel)
 3. Async copy of data to device  
 4. invoke kernels on streams
 5. histogram on streams
 6. normalize histogram on stream 0
 7. record event on stream 0
 8. Wait for the second stream to finish before starting the other streams
 9. equalize on streams
 10. Copy data back to host to R, G, B matrices
 11. Wait for all streams to finish
 12. Free device memory and destroy streams

In [None]:
%%cuda_group_save --name "ppm_conv2D_stream_event.cu" --group "lez8"
#include "/content/GPUcomputing/utils/common.h"
#include "kernels.h"

/*
 * Main function (stream + event exercise scaffold).
 *
 * Loads a PPM image, splits it into R/G/B float matrices, allocates the device
 * buffers (input and output per channel, Gaussian mask, one 256-bin histogram
 * per channel) and the launch configurations. The GPU pipeline itself is left
 * to be filled in at the TODO markers; the channels held in R, G, B are then
 * recombined and written to outputGPU.ppm.
 */
int main(void) {
   // Load image
   char path[] = "GPUcomputing/images/dog.ppm";
   PPM *img = ppm_load(path);
   int WIDTH = img->width;
   int HEIGHT = img->height;
   printf("PPM image size (w x h): %d x %d\n", WIDTH, HEIGHT);

   // extract channels and set matrices
   Matrix R, G, B;
   R.width = WIDTH; R.height = HEIGHT;
   G.width = WIDTH; G.height = HEIGHT;
   B.width = WIDTH; B.height = HEIGHT;
   R.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   G.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   B.elements = (float *) malloc(WIDTH * HEIGHT * sizeof(float));
   color *r = ppm_extract_channel(img, 0); // get red channel
   color *g = ppm_extract_channel(img, 1); // get green channel
   color *b = ppm_extract_channel(img, 2); // get blue channel
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      R.elements[i] = (float) r[i];
      G.elements[i] = (float) g[i];
      B.elements[i] = (float) b[i];
   }

   // Gaussian filter masks
   float SIGMA = 10.0;
   Matrix M;
   M.width = MASK_SIZE;
   M.height = MASK_SIZE;
   M.elements = gaussMask(MASK_SIZE, SIGMA);

   // Allocate device memory for the RGB channels and mask
   Matrix d_R, d_G, d_B, d_R1, d_G1, d_B1, d_M;
   d_R.width = R.width; d_R.height = R.height;
   d_G.width = G.width; d_G.height = G.height;
   d_B.width = B.width; d_B.height = B.height;
   d_M.width = M.width; d_M.height = M.height;
   // the second set of channel buffers is passed to kernels by value too,
   // so its dimensions must be initialized as well
   d_R1.width = R.width; d_R1.height = R.height;
   d_G1.width = G.width; d_G1.height = G.height;
   d_B1.width = B.width; d_B1.height = B.height;
   CHECK(cudaMalloc(&d_R.elements, R.width * R.height * sizeof(float)));
   CHECK(cudaMalloc(&d_G.elements, G.width * G.height * sizeof(float)));
   CHECK(cudaMalloc(&d_B.elements, B.width * B.height * sizeof(float)));
   CHECK(cudaMalloc(&d_M.elements, M.width * M.height * sizeof(float)));
   CHECK(cudaMalloc(&d_R1.elements, R.width * R.height * sizeof(float)));
   CHECK(cudaMalloc(&d_G1.elements, G.width * G.height * sizeof(float)));
   CHECK(cudaMalloc(&d_B1.elements, B.width * B.height * sizeof(float)));

   // allocate memory for the histograms on the device (256 float bins each)
   float *histogram_R, *histogram_G, *histogram_B;
   const size_t histBytes = 256 * sizeof(float);   // size in bytes, not a bin count
   CHECK(cudaMalloc(&histogram_R, histBytes));
   CHECK(cudaMemset(histogram_R, 0, histBytes));
   CHECK(cudaMalloc(&histogram_G, histBytes));
   CHECK(cudaMemset(histogram_G, 0, histBytes));
   CHECK(cudaMalloc(&histogram_B, histBytes));
   CHECK(cudaMemset(histogram_B, 0, histBytes));

   // Copy mask to device
   CHECK(cudaMemcpy(d_M.elements, M.elements, M.width * M.height * sizeof(float), cudaMemcpyHostToDevice));

   /***********************************************************/
   /*   PPM filtering and equalize with streams and events    */
   /***********************************************************/
   printf("\nFiltering PPM image with multiple kernels...\n");
   const int C = 3; // RGB channels
   dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
   dim3 dimGrid((WIDTH + BLOCK_SIZE - 1) / BLOCK_SIZE, (HEIGHT + BLOCK_SIZE - 1) / BLOCK_SIZE);
   dim3 dimBlock1(512);
   dim3 dimGrid1((WIDTH * HEIGHT + dimBlock1.x - 1) / dimBlock1.x);

   // create an event on channel G for synchronization


   // CUDA stream creation (one for each channel)


   // Async copy of data to device


   // invoke kernels on streams


   // histogram on streams


   // normalize histogram on stream 0


   // record event on stream 0


   // Wait for the second stream to finish before starting the other streams


   // equalize on streams


   // Copy data back to host to R, G, B matrices


   // Wait for all streams to finish


   // Free device memory and destroy streams


   // Copy channels to new image
   for (int i = 0; i < WIDTH * HEIGHT; i++) {
      r[i] = (color) R.elements[i];
      g[i] = (color) G.elements[i];
      b[i] = (color) B.elements[i];
   }

   // build new image
   PPM *ppm_filtered = ppm_combine_channels(r, g, b, WIDTH, HEIGHT);
   ppm_write(ppm_filtered, "outputGPU.ppm");

   printf("\nImage saved: outputGPU.ppm\n");

   return 0;
}




↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lez8/ppm_conv2D_stream_event.cu -o ppm_conv2D_stream_event -I /content/GPUcomputing/utils/PPM  /content/GPUcomputing/utils/PPM/kernels.cu /content/GPUcomputing/utils/PPM/ppm.cpp
!./ppm_conv2D_stream_event

# ✅ MQDB con stream

↘️ **TODO...**

- Disegnare un kernel per il prodotto tra matrici MQDB con le seguenti specifiche:
- Allocare spazio per matrici MQDB su CPU e GPU
- Confrontare uso di memoria unificata vs memoria asincrona
- Introdurre gli stream su cui distribuire il carico (grid parall.)
- Analisi di prestazioni usando i tempi ricavati con CUDA event

In [None]:
%%cuda_group_save --name "MQDB_stream_Unified.cu" --group "lez8"

#include "/content/GPUcomputing/utils/MQDB/mqdb.h"
#include "/content/GPUcomputing/utils/common.h"

#define BLOCK_SIZE 16     // block size
#define TEST_CPU 0

/*
 * Kernel for standard (naive) matrix product C = A * B on n x n matrices
 * stored in the elem arrays. Each thread with (row, col) inside the matrix
 * computes one entry of C; the remaining threads do nothing.
 */
__global__ void matProdKernel(mqdb *A, mqdb *B, mqdb *C, int n) {
	// entry of the product assigned to this thread
	const int r = threadIdx.y + blockIdx.y * blockDim.y;
	const int c = threadIdx.x + blockIdx.x * blockDim.x;

	// threads outside the matrix have nothing to compute
	if (r >= n || c >= n)
		return;

	// dot product of row r of A with column c of B
	float acc = 0;
	int k = 0;
	while (k < n) {
		acc += A->elem[r * n + k] * B->elem[k * n + c];
		++k;
	}
	C->elem[r * n + c] = acc;
}

/*
 * Kernel for block sub-matrix product of mqdb.
 * Multiplies the d x d sub-matrices of A and B that start at element offset
 * (n+1)*sdim of the n-wide elem arrays and stores the product at the same
 * position in C. Threads with row or col >= d do nothing.
 */
__global__ void mqdbBlockProd(mqdb *A, mqdb *B, mqdb *C, uint sdim, uint d, uint n) {
	const int r = threadIdx.y + blockIdx.y * blockDim.y;
	const int c = threadIdx.x + blockIdx.x * blockDim.x;

	// element offset of the sub-matrix handled by this launch
	const uint base = (n+1)*sdim;

	// only the d x d threads mapped onto the sub-matrix do any work
	if (!((r < d) && (c < d)))
		return;

	float acc = 0;
	for (uint k = 0; k < d; k++)
		acc += A->elem[r * n + k + base] * B->elem[k * n + c + base];
	C->elem[r * n + c + base] = acc;
}


/*
 * Test on MQDB kernels using Unified Memory.
 *
 * Builds three n x n MQDB matrices with k diagonal blocks in managed memory
 * and times, with the supplied CUDA events:
 *   1. the naive full matrix product,
 *   2. the block-wise MQDB product, one launch per diagonal block,
 *   3. the block-wise MQDB product with every block launched in its own stream.
 * All memory and streams created here are released before returning.
 *
 *   n           matrix order
 *   k           number of diagonal blocks
 *   start/stop  events created by the caller, used for timing
 */
void testKernelsMQDB_unified(uint n, uint k, cudaEvent_t start, cudaEvent_t stop) {

	// matrix instance generation - Unified Memory
	const size_t elemBytes = (size_t) n * n * sizeof(float);
	mqdb *A, *B, *C;
	CHECK(cudaMallocManaged(&A, sizeof(mqdb)));
	CHECK(cudaMallocManaged(&A->blkSize, k*sizeof(int)));
	CHECK(cudaMallocManaged(&A->elem, elemBytes));

	CHECK(cudaMallocManaged(&B, sizeof(mqdb)));
	CHECK(cudaMallocManaged(&B->blkSize, k*sizeof(int)));
	CHECK(cudaMallocManaged(&B->elem, elemBytes));

	CHECK(cudaMallocManaged(&C, sizeof(mqdb)));
	CHECK(cudaMallocManaged(&C->blkSize, k*sizeof(int)));
	CHECK(cudaMallocManaged(&C->elem, elemBytes));

	// random fill mat entries
	int seed = 1;
	genRandDimsUnified(A, n, k, seed);
	genRandDimsUnified(B, n, k, seed);
	genRandDimsUnified(C, n, k, seed);
	fillBlocksUnified(A, n, k, 'C', 1);
	fillBlocksUnified(B, n, k, 'C', 2);
	fillBlocksUnified(C, n, k, 'C', 0);

	// host snapshot of the block sizes, taken before any kernel is launched,
	// so the launch loops never touch managed memory while kernels are running
	uint *blkDims = (uint *) malloc(k * sizeof(uint));
	if (blkDims == NULL) {
		fprintf(stderr, "Host allocation of the block-size table failed\n");
		exit(EXIT_FAILURE);
	}
	for (uint i = 0; i < k; i++)
		blkDims[i] = A->blkSize[i];

	ulong nBytes = elemBytes;
	printf("Memory size required = %3.4f (MB)\n",(float)nBytes/(1024.0*1024.0));


	/***********************************************************/
	/*                     GPU mat product                     */
	/***********************************************************/

	printf("Kernel (naive) mat product...\n");
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
	float milliseconds;
	CHECK(cudaEventRecord(start));
	matProdKernel<<<grid, block>>>(A, B, C, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime1 = milliseconds / 1000.0;
	printf("   elapsed time               : %.4f (sec)\n", GPUtime1);
	//mqdbDisplay(C);

	/***********************************************************/
	/*                     GPU MQDB product                    */
	/***********************************************************/

	printf("Kernel MQDB product...\n");
	uint sdim = 0;
	CHECK(cudaEventRecord(start));
	for (uint i = 0; i < k; i++ ) {
		uint d = blkDims[i];
		if (d > 0) {   // a zero-sized grid is an invalid launch configuration
			// the kernel only uses d x d threads, so size the grid on the block
			dim3 gridBlk((d + block.x - 1) / block.x, (d + block.y - 1) / block.y);
			mqdbBlockProd<<<gridBlk, block>>>(A, B, C, sdim, d, n);
		}
		sdim += d;
	}
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime2 = milliseconds / 1000.0;
	printf("   elapsed time                  :  %.4f (sec)\n", GPUtime2);
	printf("   speedup vs GPU std mat product:  %.4f\n\n", GPUtime1/GPUtime2);

	/***********************************************************/
	/*             GPU MQDB product using streams              */
	/***********************************************************/

	printf("Kernel MQDB product using streams...\n");

	// one stream per diagonal block: the blocks occupy disjoint regions of C,
	// so their products are independent and may run concurrently
	const int nstreams = (int) k;
	cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
	if (streams == NULL) {
		fprintf(stderr, "Host allocation of the stream table failed\n");
		exit(EXIT_FAILURE);
	}
	for (int i = 0; i < nstreams; i++)
		CHECK(cudaStreamCreate(&streams[i]));

	sdim = 0;
	CHECK(cudaEventRecord(start));
	for (int i = 0; i < nstreams; i++) {
		uint d = blkDims[i];
		if (d > 0) {
			dim3 gridBlk((d + block.x - 1) / block.x, (d + block.y - 1) / block.y);
			mqdbBlockProd<<<gridBlk, block, 0, streams[i]>>>(A, B, C, sdim, d, n);
		}
		sdim += d;
	}
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());   // wait for every stream before stopping the clock
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

	float GPUtime3 = milliseconds / 1000.0;
	printf("   elapsed time                  : %.5f (sec)\n", GPUtime3);
	printf("   speedup vs GPU std mat product: %.2f\n",GPUtime1/GPUtime3);
	printf("   speedup vs GPU MQDB product   : %.2f\n",GPUtime2/GPUtime3);
	//mqdbDisplay(C);

	// clean up streams
	for (int i = 0; i < nstreams; i++)
		CHECK(cudaStreamDestroy(streams[i]));
	free(streams);
	free(blkDims);

	// release the managed matrices (this function runs once per tested k)
	CHECK(cudaFree(A->blkSize));
	CHECK(cudaFree(A->elem));
	CHECK(cudaFree(A));
	CHECK(cudaFree(B->blkSize));
	CHECK(cudaFree(B->elem));
	CHECK(cudaFree(B));
	CHECK(cudaFree(C->blkSize));
	CHECK(cudaFree(C->elem));
	CHECK(cudaFree(C));
}

/*
 * main function.
 *
 * Selects device 0, creates the two timing events and runs the MQDB test for
 * k = 20, 25, 30 diagonal blocks on matrices of order 8192.
 */
int main(int argc, char *argv[]) {

	// set up device
	int dev = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
	printf("%s starting mqdb product at ", argv[0]);
	printf("device %d: %s\n", dev, deviceProp.name);
	CHECK(cudaSetDevice(dev));

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	uint n = 8*1024;         // matrix size
	uint min_k = 20;         // min num of blocks
	uint max_k = 30;         // max num of blocks

	// multiple tests for k = # diag blocks
	for (uint k = min_k; k <= max_k; k+=5) {
		printf("\n*****   k = %u --- (avg block size = %f)\n",k,(float)n/k);
		testKernelsMQDB_unified(n, k, start, stop);
	}

	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	return 0;
}




↩ **Run...**

In [None]:
!nvcc -arch=sm_75  src/lez8/MQDB_stream_Unified.cu /content/GPUcomputing/utils/MQDB/mqdb.cpp -o MQDBS
!./MQDBS