---
# **LAB 7 - CUDA Streams**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

Download the GPU computing notebooks (from GitHub)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ▶️ DeviceQuery

In [None]:
# DeviceQuery dell'attuale device (su Colab!)
!nvcc -arch=sm_75 /content/GPUcomputing/utils/deviceQuery.cu -o deviceQuery
!./deviceQuery

# ✅ Streams and priority

Simple stream...

In [None]:
%%cuda

#include <stdio.h>
#include "../../GPUcomputing/utils/common.h"


// helper functions and utilities to work with CUDA
//#include <helper_functions.h>
//#include <helper_cuda.h>

// Name of this sample; printed by printHelp() in its usage line.
const char *sSDKsample = "simpleStreams";

// NULL-terminated table of event-creation flag names.
// NOTE(review): not referenced anywhere else in this cell.
const char *sEventSyncMethod[] = {
    "cudaEventDefault",
    "cudaEventBlockingSync",
    "cudaEventDisableTiming",
    NULL,
};

// NULL-terminated table of device scheduling flag names. Slot 3 holds the
// placeholder "INVALID" -- presumably so that the index lines up with the
// numeric flag value (TODO confirm against the CUDA headers).
// NOTE(review): not referenced anywhere else in this cell.
const char *sDeviceSyncMethod[] = {
    "cudaDeviceScheduleAuto",
    "cudaDeviceScheduleSpin",
    "cudaDeviceScheduleYield",
    "INVALID",
    "cudaDeviceScheduleBlockingSync",
    NULL,
};

// Kernel: every thread adds *factor to its own element of g_data,
// num_iterations times.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds check, so the
// caller must launch exactly one thread per element of g_data.
__global__ void init_array(int *g_data, int *factor, int num_iterations) {
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;

  int remaining = num_iterations;
  while (remaining-- > 0) {
    g_data[gid] += *factor;  // repeated on purpose so the kernel takes measurable time
  }
}

// Checks that the first n entries of a are all equal to c.
// On the first mismatch prints "<index>: <found> <expected>" and returns
// false; returns true when every entry matches (including when n <= 0).
bool correct_data(int *a, const int n, const int c) {
  int pos = 0;

  // advance past the matching prefix
  while (pos < n && a[pos] == c) {
    ++pos;
  }

  if (pos < n) {
    printf("%d: %d %d\n", pos, a[pos], c);
    return false;
  }

  return true;
}

// Human-readable descriptions of the --sync_method values, indexed by the
// numeric value n; NULL-terminated. Used by printHelp().
static const char *sSyncMethod[] = {
    /* 0 */ "0 (Automatic Blocking)",
    /* 1 */ "1 (Spin Blocking)",
    /* 2 */ "2 (Yield Blocking)",
    /* 3 */ "3 (Undefined Blocking Method)",
    /* 4 */ "4 (Blocking Sync Event) = low CPU utilization",
    /* end */ NULL,
};

// Prints the command-line usage text of the sample to stdout.
// NOTE(review): nothing in this cell calls printHelp(), and main() does not
// parse any of the options listed here.
void printHelp() {
  printf("Usage: %s [options below]\n", sSDKsample);
  fputs("\t--sync_method=n for CPU/GPU synchronization\n", stdout);

  // entries 0..2 are listed as plain choices; entry 3 is intentionally skipped
  for (int mode = 0; mode <= 2; ++mode) {
    printf("\t             n=%s\n", sSyncMethod[mode]);
  }
  printf("\t   <Default> n=%s\n", sSyncMethod[4]);

  fputs("\t--use_generic_memory (default) use generic page-aligned for system "
        "memory\n",
        stdout);
  fputs("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
        "system memory\n",
        stdout);
}

/*******************************************************************************/
/*                                  MAIN																			 */
/*******************************************************************************/
// Times, in order: one device-to-host copy, one kernel, nreps iterations of
// {kernel, blocking copy} in the default stream, and nreps iterations of the
// same work split across nstreams streams. Finally verifies the host buffer.
//
// All timing events are recorded in stream 0 while the work is issued to
// streams created with cudaStreamCreate(); the measurements therefore rely on
// the legacy default-stream behaviour (stream 0 synchronizes with those
// streams). NOTE(review): compiling with --default-stream per-thread would
// invalidate the timings -- confirm the build flags.
//
// Returns EXIT_SUCCESS when every element of the result equals
// c * nreps * niterations, EXIT_FAILURE otherwise.
int main(int argc, char **argv) {
  int nstreams = 4;              // number of streams for CUDA calls
  int nreps = 10;                // number of times each experiment is repeated
  int n = 16 * 1024 * 1024;      // number of ints in the data set
  int nbytes = n * sizeof(int);  // number of data bytes
  float elapsed_time, time_memcpy, time_kernel;  // timing variables
  int niterations = 5;  // number of iterations for the loop inside the kernel

  // print dev features
  device_feat();

  // allocate pinned host memory (required for truly asynchronous copies)
  int c = 5;            // value added by the kernel at every iteration
  int *h_a = 0;         // pointer to the array data in host memory
  CHECK(cudaMallocHost((void **)&h_a, nbytes));

  // allocate device memory
  int *d_a = 0, *d_c = 0;  // pointers to data and init value in the device memory
  CHECK(cudaMalloc((void **)&d_a, nbytes));
  CHECK(cudaMemset(d_a, 0x0, nbytes));
  CHECK(cudaMalloc((void **)&d_c, sizeof(int)));
  CHECK(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

  // table header (the original literals contained the invalid escape
  // sequences "\O" and "\-")
  printf("Operation\ttime\n");
  printf("---------------------\n");

  // allocate and initialize an array of stream handles
  cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
  if (streams == NULL) {
    fprintf(stderr, "Unable to allocate %d stream handles\n", nstreams);
    return EXIT_FAILURE;
  }

  for (int i = 0; i < nstreams; i++) {
    CHECK(cudaStreamCreate(&(streams[i])));
  }

  // create CUDA event handles (default flags: timing enabled)
  cudaEvent_t start_event, stop_event;

  CHECK(cudaEventCreateWithFlags(&start_event, cudaEventDefault));
  CHECK(cudaEventCreateWithFlags(&stop_event, cudaEventDefault));

  // time memcopy from device
  CHECK(cudaEventRecord(start_event, 0));  // record in stream-0, to ensure that
                                           // all previous CUDA calls have completed
  CHECK(cudaMemcpyAsync(h_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
  CHECK(cudaEventRecord(stop_event, 0));
  CHECK(cudaEventSynchronize(stop_event));  // block until the event is actually recorded
  CHECK(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
  printf("memcopy:\t%.2f\n", time_memcpy);

  // time kernel (one thread per element; n is a multiple of the block size)
  dim3 threads = dim3(512, 1);
  dim3 blocks = dim3(n / threads.x, 1);
  CHECK(cudaEventRecord(start_event, 0));
  init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
  CHECK(cudaGetLastError());  // surface launch-configuration errors
  CHECK(cudaEventRecord(stop_event, 0));
  CHECK(cudaEventSynchronize(stop_event));
  CHECK(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
  printf("kernel:\t\t%.2f\n", time_kernel);

  //********************************************
  // time non-streamed execution for reference
  //********************************************
  threads = dim3(512, 1);
  blocks = dim3(n / threads.x, 1);
  CHECK(cudaEventRecord(start_event, 0));

  for (int k = 0; k < nreps; k++) {
    init_array<<<blocks, threads>>>(d_a, d_c, niterations);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));
  }

  CHECK(cudaEventRecord(stop_event, 0));
  CHECK(cudaEventSynchronize(stop_event));
  CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
  printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

  //********************************************
  // time execution with nstreams streams
  //********************************************
  threads = dim3(512, 1);
  blocks = dim3(n / (nstreams * threads.x), 1);
  memset(h_a, 255, nbytes);           // set host memory bits to all 1s, for testing correctness
  CHECK(cudaMemset(d_a, 0, nbytes));  // set device memory to all 0s, for testing correctness
  CHECK(cudaEventRecord(start_event, 0));

  for (int k = 0; k < nreps; k++) {
    // asynchronously launch nstreams kernels, each operating on its own portion of data
    for (int i = 0; i < nstreams; i++) {
      init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
      CHECK(cudaGetLastError());
    }

    // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
    // commence executing when all previous CUDA calls in stream x have completed
    for (int i = 0; i < nstreams; i++) {
      CHECK(cudaMemcpyAsync(h_a + i * n / nstreams,
                            d_a + i * n / nstreams, nbytes / nstreams,
                            cudaMemcpyDeviceToHost, streams[i]));
    }
  }

  CHECK(cudaEventRecord(stop_event, 0));
  CHECK(cudaEventSynchronize(stop_event));
  CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
  printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

  // check whether the output is correct: d_a was zeroed before the streamed
  // run, and each of the nreps repetitions adds c, niterations times
  printf("---------------------\n");
  bool bResults = correct_data(h_a, n, c * nreps * niterations);

  // release resources
  for (int i = 0; i < nstreams; i++) {
    CHECK(cudaStreamDestroy(streams[i]));
  }
  free(streams);  // the handle array itself was leaked in the original

  CHECK(cudaEventDestroy(start_event));
  CHECK(cudaEventDestroy(stop_event));

  // Free cudaMallocHost
  CHECK(cudaFreeHost(h_a));

  CHECK(cudaFree(d_a));
  CHECK(cudaFree(d_c));

  return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}

Stream priorities...

In [None]:
%%cuda
#include <cstdio>
#include <stdio.h>
#include "../../GPUcomputing/utils/common.h"

// Buffer sizes in BYTES. The expressions are parenthesized so the macros
// expand safely inside larger expressions (e.g. TOTAL_SIZE / EACH_SIZE).
#define TOTAL_SIZE (256 * 1024 * 1024)  // bytes copied per stream in total
#define EACH_SIZE (128 * 1024 * 1024)   // bytes handled by one kernel launch

// launch configuration: thread blocks per grid and threads per block
#define TBLOCKS 1024
#define THREADS 512


// Kernel: copies n BYTES (n / sizeof(int) whole ints) from src to dst.
//
// Each thread starts at its global index and advances by the total number of
// launched threads, so any 1D grid/block configuration covers the whole range.
// Indices are kept in size_t: with int they would overflow once the element
// count or the stride exceeds INT_MAX, and the comparison against
// n / sizeof(int) mixed signed and unsigned operands.
__global__ void memcpy_kernel(int *dst, int *src, size_t n) {
  const size_t count = n / sizeof(int);
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  const size_t first = (size_t)blockDim.x * blockIdx.x + threadIdx.x;

  for (size_t i = first; i < count; i += stride) {
    dst[i] = src[i];
  }
}

// Fills a host buffer of n BYTES with the sequence 0, 1, 2, ...
// (one value per int; only the n / sizeof(int) whole ints are written).
void mem_init(int *buf, size_t n) {
  const size_t count = n / sizeof(int);
  int value = 0;

  for (int *p = buf; p != buf + count; ++p) {
    *p = value++;
  }
}

/******************************************************************************/
/*                                  MAIN																			*/
/******************************************************************************/
// Launches the same sequence of copy kernels into a lowest-priority and a
// highest-priority stream, times each stream with its own pair of events and
// verifies that both destination buffers match their sources.
//
// Returns EXIT_SUCCESS when both copies are verified, EXIT_FAILURE otherwise.
int main(int argc, char **argv) {
  cudaDeviceProp device_prop;
  int dev_id;

  printf("Starting...\n");

  // set device
  dev_id = findCudaDevice(argc, (const char **)argv);
  CHECK(cudaGetDeviceProperties(&device_prop, dev_id));

  // get the range of priorities available: the first output is the least
  // (lowest) priority, the second the greatest (highest) priority
  int priority_low;
  int priority_hi;
  CHECK(cudaDeviceGetStreamPriorityRange(&priority_low, &priority_hi));

  printf("CUDA stream priority range: LOW: %d to HIGH: %d\n", priority_low, priority_hi);

  // create streams with highest and lowest available priorities
  cudaStream_t st_low;
  cudaStream_t st_hi;
  CHECK(cudaStreamCreateWithPriority(&st_low, cudaStreamNonBlocking, priority_low));
  CHECK(cudaStreamCreateWithPriority(&st_hi, cudaStreamNonBlocking, priority_hi));

  size_t size;
  size = TOTAL_SIZE;

  // allocate host source and destination buffers
  int *h_src_low = (int *)malloc(size);
  int *h_src_hi = (int *)malloc(size);
  int *h_dst_low = (int *)malloc(size);
  int *h_dst_hi = (int *)malloc(size);
  if (h_src_low == NULL || h_src_hi == NULL || h_dst_low == NULL || h_dst_hi == NULL) {
    fprintf(stderr, "Unable to allocate host buffers\n");
    return EXIT_FAILURE;
  }

  // initialise host source data, clear host destination data
  mem_init(h_src_low, size);
  mem_init(h_src_hi, size);
  memset(h_dst_low, 0, size);
  memset(h_dst_hi, 0, size);

  // copy source data -> device
  int *d_src_low;
  int *d_src_hi;
  CHECK(cudaMalloc(&d_src_low, size));
  CHECK(cudaMalloc(&d_src_hi, size));
  CHECK(cudaMemcpy(d_src_low, h_src_low, size, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(d_src_hi, h_src_hi, size, cudaMemcpyHostToDevice));

  // allocate memory for memcopy destination
  int *d_dst_low;
  int *d_dst_hi;
  CHECK(cudaMalloc(&d_dst_low, size));
  CHECK(cudaMalloc(&d_dst_hi, size));

  // create one start/end event pair per stream
  cudaEvent_t ev_start_low;
  cudaEvent_t ev_start_hi;
  cudaEvent_t ev_end_low;
  cudaEvent_t ev_end_hi;
  CHECK(cudaEventCreate(&ev_start_low));
  CHECK(cudaEventCreate(&ev_start_hi));
  CHECK(cudaEventCreate(&ev_end_low));
  CHECK(cudaEventCreate(&ev_end_hi));

  // call pair of kernels repeatedly (with different priority streams)
  CHECK(cudaEventRecord(ev_start_low, st_low));
  CHECK(cudaEventRecord(ev_start_hi, st_hi));

  for (int i = 0; i < TOTAL_SIZE; i += EACH_SIZE) {
    int j = i / sizeof(int);  // byte offset -> element offset
    memcpy_kernel<<<TBLOCKS, THREADS, 0, st_low>>>(d_dst_low + j, d_src_low + j, EACH_SIZE);
    CHECK(cudaGetLastError());
    memcpy_kernel<<<TBLOCKS, THREADS, 0, st_hi>>>(d_dst_hi + j, d_src_hi + j, EACH_SIZE);
    CHECK(cudaGetLastError());
  }

  CHECK(cudaEventRecord(ev_end_low, st_low));
  CHECK(cudaEventRecord(ev_end_hi, st_hi));

  CHECK(cudaEventSynchronize(ev_end_low));
  CHECK(cudaEventSynchronize(ev_end_hi));

  // copy results back
  size = TOTAL_SIZE;
  CHECK(cudaMemcpy(h_dst_low, d_dst_low, size, cudaMemcpyDeviceToHost));
  CHECK(cudaMemcpy(h_dst_hi, d_dst_hi, size, cudaMemcpyDeviceToHost));

  // check results of kernels (the original discarded the memcmp results)
  const bool ok_low = (memcmp(h_dst_low, h_src_low, size) == 0);
  const bool ok_hi = (memcmp(h_dst_hi, h_src_hi, size) == 0);
  if (!ok_low)
    printf("ERROR: data copied in the LOW priority stream does not match the source\n");
  if (!ok_hi)
    printf("ERROR: data copied in the HI  priority stream does not match the source\n");

  // check timings
  float ms_low;
  float ms_hi;
  CHECK(cudaEventElapsedTime(&ms_low, ev_start_low, ev_end_low));
  CHECK(cudaEventElapsedTime(&ms_hi, ev_start_hi, ev_end_hi));

  printf("elapsed time of kernels launched to LOW priority stream: %.3lf ms\n", ms_low);
  printf("elapsed time of kernels launched to HI  priority stream: %.3lf ms\n", ms_hi);

  // release every resource acquired above
  CHECK(cudaEventDestroy(ev_start_low));
  CHECK(cudaEventDestroy(ev_start_hi));
  CHECK(cudaEventDestroy(ev_end_low));
  CHECK(cudaEventDestroy(ev_end_hi));
  CHECK(cudaStreamDestroy(st_low));
  CHECK(cudaStreamDestroy(st_hi));
  CHECK(cudaFree(d_src_low));
  CHECK(cudaFree(d_src_hi));
  CHECK(cudaFree(d_dst_low));
  CHECK(cudaFree(d_dst_hi));
  free(h_src_low);
  free(h_src_hi);
  free(h_dst_low);
  free(h_dst_hi);

  return (ok_low && ok_hi) ? EXIT_SUCCESS : EXIT_FAILURE;
}

# ✅ Somma array con stream

This example demonstrates overlapping computation and communication by
partitioning a data set and asynchronously launching the memory copies and kernels for each subset. Launching all transfers and kernels for a given subset in the same CUDA stream ensures that computation on the device is not started until the necessary data has been transferred. However, because the work of each subset is independent of all other subsets, the communication and computation of different subsets will overlap.

This example launches copies and kernels in breadth-first order.

In [None]:
%%cuda

#include "../../GPUcomputing/utils/common.h"

#define NSTREAM 4
#define BDIM 128

// Fills ip[0..size) with pseudo-random values: the low 8 bits of rand()
// divided by 10, i.e. values in [0.0, 25.5]. Does nothing when size <= 0.
void initialData(float *ip, int size) {
  int pos = 0;

  while (pos < size) {
    const int low_byte = rand() & 0xFF;
    ip[pos] = (float)low_byte / 10.0f;
    ++pos;
  }
}

// Host reference implementation: C[i] = A[i] + B[i] for i in [0, N).
void sumArraysOnHost(float *A, float *B, float *C, const int N) {
  const float *lhs = A;
  const float *rhs = B;
  float *out = C;

  for (int remaining = N; remaining > 0; --remaining) {
    *out++ = *lhs++ + *rhs++;
  }
}

// Kernel: element-wise vector addition, C[i] = A[i] + B[i] for i in [0, N).
// Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
// global index is >= N do nothing.
__global__ void sumArrays(float *A, float *B, float *C, const int N) {
  const int gid = threadIdx.x + blockDim.x * blockIdx.x;

  if (gid >= N)
    return;

  C[gid] = A[gid] + B[gid];
}

// Compares the first N entries of hostRef and gpuRef and reports on stdout.
// Prints "Arrays match." when every pair differs by at most epsilon;
// otherwise prints the first mismatching pair and its index, then stops.
void checkResult(float *hostRef, float *gpuRef, const int N) {
  const double epsilon = 1.0E-8;
  bool match = true;

  for (int i = 0; i < N; i++) {
    // Absolute difference computed by hand: the unqualified abs() used
    // originally can resolve to the integer overload (depending on which
    // headers are visible), truncating every difference below 1.0 to zero.
    float diff = hostRef[i] - gpuRef[i];
    if (diff < 0.0f)
      diff = -diff;

    if (diff > epsilon) {
      match = false;
      printf("Arrays do not match!\n");
      printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
      break;
    }
  }

  if (match)
    printf("Arrays match.\n\n");
}

/******************************************************************************/
/*                                  MAIN																			*/
/******************************************************************************/
// Vector addition timed twice:
//   1. sequentially (blocking H2D copies, one kernel, blocking D2H copy);
//   2. split into NSTREAM chunks, issuing all H2D copies, then all kernels,
//      then all D2H copies (breadth-first), each chunk in its own stream.
// Prints the timings and the resulting speedup, then checks the GPU result
// against the host reference.
int main(int argc, char **argv) {
  printf("> %s Starting...\n", argv[0]);

  int dev = 0;
  cudaDeviceProp deviceProp;
  CHECK(cudaGetDeviceProperties(&deviceProp, dev));
  printf("> Using Device %d: %s\n", dev, deviceProp.name);
  CHECK(cudaSetDevice(dev));

  // check if device support hyper-q
  if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
    if (deviceProp.concurrentKernels == 0) {
      printf("> GPU does not support concurrent kernel execution (SM 3.5 or higher required)\n");
      printf("> CUDA kernel runs will be serialized\n");
    }
    else {
      printf("> GPU does not support HyperQ\n");
      printf("> CUDA kernel runs will have limited concurrency\n");
    }
  }

  // Report the deviceOverlap capability flag.
  // NOTE(review): per the CUDA docs this flag describes overlapping a copy
  // with kernel execution (and is deprecated in favour of asyncEngineCount),
  // which is not quite what the message below says -- confirm the wording.
  printf("> Device %s capable of simultaneous CPU-to-GPU datatransfers\n", deviceProp.deviceOverlap ? "IS": "NOT");

  printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
          deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

  printf ("> with streams = %d\n", NSTREAM);

  // set up data size of vectors
  int nElem = 1 << 26;
  printf("> vector size = %d\n", nElem);
  size_t nBytes = nElem * sizeof(float);

  // malloc pinned host memory for async memcpy
  float *h_A, *h_B, *hostRef, *gpuRef;
  CHECK(cudaHostAlloc((void**)&h_A, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&h_B, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&gpuRef, nBytes, cudaHostAllocDefault));
  CHECK(cudaHostAlloc((void**)&hostRef, nBytes, cudaHostAllocDefault));

  // initialize data at host side
  initialData(h_A, nElem);
  initialData(h_B, nElem);
  memset(hostRef, 0, nBytes);
  memset(gpuRef,  0, nBytes);

  // add vector at host side for result checks
  sumArraysOnHost(h_A, h_B, hostRef, nElem);

  // malloc device global memory
  float *d_A, *d_B, *d_C;
  CHECK(cudaMalloc((float**)&d_A, nBytes));
  CHECK(cudaMalloc((float**)&d_B, nBytes));
  CHECK(cudaMalloc((float**)&d_C, nBytes));

  cudaEvent_t start, stop;
  CHECK(cudaEventCreate(&start));
  CHECK(cudaEventCreate(&stop));

  // launch configuration for the sequential run: one thread per element
  dim3 block (BDIM);
  dim3 grid  ((nElem + block.x - 1) / block.x);
  printf("> grid (%d, %d) block (%d, %d)\n", grid.x, grid.y, block.x, block.y);

  // sequential operation: H2D copies
  CHECK(cudaEventRecord(start, 0));
  CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
  CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
  CHECK(cudaEventRecord(stop, 0));
  CHECK(cudaEventSynchronize(stop));
  float memcpy_h2d_time;
  CHECK(cudaEventElapsedTime(&memcpy_h2d_time, start, stop));

  // sequential operation: kernel
  CHECK(cudaEventRecord(start, 0));
  sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
  CHECK(cudaGetLastError());  // surface launch-configuration errors
  CHECK(cudaEventRecord(stop, 0));
  CHECK(cudaEventSynchronize(stop));
  float kernel_time;
  CHECK(cudaEventElapsedTime(&kernel_time, start, stop));

  // sequential operation: D2H copy
  CHECK(cudaEventRecord(start, 0));
  CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
  CHECK(cudaEventRecord(stop, 0));
  CHECK(cudaEventSynchronize(stop));
  float memcpy_d2h_time;
  CHECK(cudaEventElapsedTime(&memcpy_d2h_time, start, stop));
  float itotal = kernel_time + memcpy_h2d_time + memcpy_d2h_time;

  // bytes * 1e-6 / ms == GB/s
  printf("\n");
  printf("Measured timings (throughput):\n");
  // the timed H2D section moves BOTH input arrays, i.e. 2 * nBytes
  // (the original divided a single nBytes by the time, halving the figure)
  printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (nBytes * 2e-6) / memcpy_h2d_time);
  printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (nBytes * 1e-6) / memcpy_d2h_time);
  printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (nBytes * 2e-6) / kernel_time);
  printf(" Total\t\t\t: %f ms (%f GB/s)\n", itotal, (nBytes * 2e-6) / itotal);

  // grid parallel operation: each stream handles iElem contiguous elements
  int iElem = nElem / NSTREAM;
  size_t iBytes = iElem * sizeof(float);
  grid.x = (iElem + block.x - 1) / block.x;

  cudaStream_t stream[NSTREAM];

  for (int i = 0; i < NSTREAM; ++i)
    CHECK(cudaStreamCreate(&stream[i]));

  CHECK(cudaEventRecord(start, 0));

  // initiate all asynchronous transfers to the device
  for (int i = 0; i < NSTREAM; ++i) {
    int ioffset = i * iElem;
    CHECK(cudaMemcpyAsync(&d_A[ioffset], &h_A[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]));
    CHECK(cudaMemcpyAsync(&d_B[ioffset], &h_B[ioffset], iBytes, cudaMemcpyHostToDevice, stream[i]));
  }

  // launch a kernel in each stream
  for (int i = 0; i < NSTREAM; ++i) {
    int ioffset = i * iElem;
    sumArrays<<<grid, block, 0, stream[i]>>>(&d_A[ioffset], &d_B[ioffset], &d_C[ioffset], iElem);
  }

  // enqueue asynchronous transfers from the device
  for (int i = 0; i < NSTREAM; ++i) {
    int ioffset = i * iElem;
    CHECK(cudaMemcpyAsync(&gpuRef[ioffset], &d_C[ioffset], iBytes, cudaMemcpyDeviceToHost, stream[i]));
  }

  CHECK(cudaEventRecord(stop, 0));
  CHECK(cudaEventSynchronize(stop));
  float execution_time;
  CHECK(cudaEventElapsedTime(&execution_time, start, stop));

  printf("\n");
  printf("Actual results from overlapped data transfers:\n");
  printf(" overlap with %d streams : %f ms (%f GB/s)\n", NSTREAM, execution_time, (nBytes * 2e-6) / execution_time );
  printf(" speedup                : %f \n", itotal/execution_time);

  // check kernel error
  CHECK(cudaGetLastError());

  // check device results
  checkResult(hostRef, gpuRef, nElem);

  // free device global memory
  CHECK(cudaFree(d_A));
  CHECK(cudaFree(d_B));
  CHECK(cudaFree(d_C));

  // free host memory
  CHECK(cudaFreeHost(h_A));
  CHECK(cudaFreeHost(h_B));
  CHECK(cudaFreeHost(hostRef));
  CHECK(cudaFreeHost(gpuRef));

  // destroy events
  CHECK(cudaEventDestroy(start));
  CHECK(cudaEventDestroy(stop));

  // destroy streams
  for (int i = 0; i < NSTREAM; ++i)
    CHECK(cudaStreamDestroy(stream[i]));

  CHECK(cudaDeviceReset());
  return(0);
}


# ✅ Tabular


### ↘️ *`TODO...`*

1. Come modificare il kernel per usare gli stream
2. Gestione della memoria pinned e device

Applicare
3. Schema: loop over {copy, kernel, copy}
4. Schema: loop over {copy H2D}, loop over {kernel}, loop over {copy D2H}


In [None]:
%%cuda

#include <stdio.h>
#include "../../GPUcomputing/utils/common.h"

#define PI 3.141592f

/*
 * Kernel: tabular function.
 * For every i in [0, n) stores a[i] = sqrt(|sin^2(x) - cos^2(x)|)
 * with x = PI * i / n.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element; threads
 * whose global index is >= n do nothing.
 */
__global__ void tabular(float *a, int n) {
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	if (i < n) {
		float x = PI * (float)i / (float)n;
		float s = sinf(x);
		float c = cosf(x);
		// fabsf makes the single-precision absolute value explicit instead of
		// relying on overload resolution of the unqualified abs()
		a[i] = sqrtf(fabsf(s * s - c * c));
	}
}

/*
 * Kernel: tabular function using streams.
 * Same computation as the non-streamed kernel, but the global index is
 * shifted by `offset`, so one launch covers only the chunk that starts at
 * that element. `a` is the base of the WHOLE array and `n` its total
 * length (x = PI * i / n uses the global index).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element of the
 * chunk. The only guard is i < n, so the grid size alone determines where
 * the chunk ends.
 */
__global__ void tabular_streams(float *a, int n, int offset) {
	int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
	if (i < n) {
		float x = PI * (float)i / (float)n;
		float s = sinf(x);
		float c = cosf(x);
		// fabsf makes the single-precision absolute value explicit instead of
		// relying on overload resolution of the unqualified abs()
		a[i] = sqrtf(fabsf(s * s - c * c));
	}
}


/******************************************************************************/
/*                                  MAIN																			*/
/******************************************************************************/
// Times the tabular computation three ways over the same pinned buffer:
//   - baseline: blocking H2D copy, one kernel, blocking D2H copy;
//   - async v1: per stream, {H2D copy, kernel, D2H copy};
//   - async v2: all H2D copies, then all kernels, then all D2H copies.
// Timing events are recorded in stream 0 while the work goes to streams
// created with cudaStreamCreate(); this relies on the legacy default-stream
// behaviour (NOTE(review): confirm the build does not use
// --default-stream per-thread).
int main(void) {

  // main params
  const uint MB = 1024 * 1024;
  const uint n = 256 * MB;
  const int blockSize = 256;
  const int nStreams = 8;  // const: used as the bound of the stream array below

  const int streamSize = n / nStreams;
  // byte counts in size_t: n * sizeof(float) is 1 GiB, too close to INT_MAX
  // to be kept safely in an int
  const size_t streamBytes = (size_t)streamSize * sizeof(float);
  const size_t bytes = (size_t)n * sizeof(float);

  int devId = 0;
  cudaDeviceProp prop;
  CHECK(cudaGetDeviceProperties(&prop, devId));
  printf("Device : %s\n\n", prop.name);
  CHECK(cudaSetDevice(devId));
  printf("Array size   : %u\n", n);
  printf("StreamSize   : %d\n", streamSize);
  printf("Memory bytes : %zu (MB)\n", bytes / MB);
  printf("streamBytes  : %zu (MB)\n", streamBytes / MB);

  // allocate pinned host memory and device memory
  float *a, *d_a;
  CHECK(cudaMallocHost((void**) &a, bytes));      // host pinned
  CHECK(cudaMalloc((void**) &d_a, bytes));        // device

  float ms; // elapsed time in milliseconds

  // create events and streams
  cudaEvent_t startEvent, stopEvent;
  cudaStream_t stream[nStreams];
  CHECK(cudaEventCreate(&startEvent));
  CHECK(cudaEventCreate(&stopEvent));
  for (int i = 0; i < nStreams; ++i)
    CHECK(cudaStreamCreate(&stream[i]));

  /******************************************************************************/
  /*       baseline case - sequential transfer and execute                      */
  /******************************************************************************/
  memset(a, 0, bytes);
  CHECK(cudaEventRecord(startEvent, 0));
  CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
  tabular<<<n / blockSize, blockSize>>>(d_a, n);
  CHECK(cudaGetLastError());  // surface launch-configuration errors
  CHECK(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
  CHECK(cudaEventRecord(stopEvent, 0));
  CHECK(cudaEventSynchronize(stopEvent));
  CHECK(cudaEventElapsedTime(&ms, startEvent, stopEvent));
  printf("\nTime for sequential transfer and execute (ms): %f\n", ms);

  /******************************************************************************/
  /*       asynchronous version 1: loop over {copy, kernel, copy}               */
  /******************************************************************************/
  memset(a, 0, bytes);
  CHECK(cudaEventRecord(startEvent, 0));
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    CHECK(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]));
    tabular_streams<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, n, offset);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]));
  }
  CHECK(cudaEventRecord(stopEvent, 0));
  CHECK(cudaEventSynchronize(stopEvent));
  CHECK(cudaEventElapsedTime(&ms, startEvent, stopEvent));
  printf("\nTime for asynchronous loop over {copy, kernel, copy} transfer and execute (ms): %f\n", ms);

  /******************************************************************************/
  /* asynchronous version 2: loop over copy, loop over kernel, loop over copy   */
  /******************************************************************************/
  memset(a, 0, bytes);
  CHECK(cudaEventRecord(startEvent, 0));
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    CHECK(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]));
  }
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    tabular_streams<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, n, offset);
    CHECK(cudaGetLastError());
  }
  for (int i = 0; i < nStreams; ++i) {
    int offset = i * streamSize;
    CHECK(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]));
  }
  CHECK(cudaEventRecord(stopEvent, 0));
  CHECK(cudaEventSynchronize(stopEvent));
  CHECK(cudaEventElapsedTime(&ms, startEvent, stopEvent));
  printf("\nTime for asynchronous loop over copy, loop over kernel, loop over copy transfer and execute (ms): %f\n", ms);

  // cleanup
  CHECK(cudaEventDestroy(startEvent));
  CHECK(cudaEventDestroy(stopEvent));
  for (int i = 0; i < nStreams; ++i)
    CHECK(cudaStreamDestroy(stream[i]));
  CHECK(cudaFree(d_a));
  CHECK(cudaFreeHost(a));

  return 0;
}


In [None]:
# Compilazione ed esecuzione

!nvcc -arch=sm_75 src/tmp/single_file.cu  -o tabular
!./tabular

In [None]:
# profilazione

!nvprof ./tabular

# ✅ MQDB con stream

### ↘️ *`TODO...`*

Disegnare un kernel per il prodotto tra matrici MQDB con le seguenti specifiche:
- Allocare spazio per matrici MQDB su CPU e GPU
- Confrontare uso di memoria unificata vs memoria asincrona
- Introdurre gli stream su cui distribuire il carico (grid parall.)
- Analisi di prestazioni usando i tempi ricavati con CUDA event

In [None]:
%%cuda_group_save --name "MQDB_stream_Unified.cu" --group "STREAMS"


#include "../../GPUcomputing/utils/MQDB/mqdb.h"
#include "../../GPUcomputing/utils/common.h"

#define BLOCK_SIZE 16     // block size
#define TEST_CPU 0

/*
 * Kernel for standard (naive) matrix product: C = A * B on the n x n
 * row-major `elem` arrays of the three mqdb descriptors.
 *
 * Launch layout: 2D grid of 2D blocks, x -> column, y -> row, one thread
 * per output entry; threads outside the n x n range do nothing.
 */
__global__ void matProdKernel(mqdb *A, mqdb *B, mqdb *C, int n) {
	// col & row indexes of the entry owned by this thread
	const int col = threadIdx.x + blockIdx.x * blockDim.x;
	const int row = threadIdx.y + blockIdx.y * blockDim.y;

	// threads outside the matrix have nothing to compute
	if (row >= n || col >= n)
		return;

	// dot product of row `row` of A with column `col` of B
	float acc = 0;
	int k = 0;
	while (k < n) {
		acc += A->elem[row * n + k] * B->elem[k * n + col];
		++k;
	}
	C->elem[row * n + col] = acc;
}

/*
 * Kernel for block sub-matrix product of mqdb: multiplies the d x d
 * diagonal block that starts at row/column `sdim` of the n x n matrices.
 *
 * Launch layout: 2D grid of 2D blocks, x -> column, y -> row, both
 * relative to the block; threads outside the d x d range do nothing.
 */
__global__ void mqdbBlockProd(mqdb *A, mqdb *B, mqdb *C, uint sdim, uint d, uint n) {
	const int col = threadIdx.x + blockIdx.x * blockDim.x;
	const int row = threadIdx.y + blockIdx.y * blockDim.y;

	// threads outside the current diagonal block have nothing to compute
	if (row >= d || col >= d)
		return;

	// linear index of the block's top-left entry: sdim rows down, sdim cols right
	const uint base = sdim * (n + 1);

	float acc = 0;
	for (int k = 0; k < d; ++k)
		acc += A->elem[row * n + k + base] * B->elem[k * n + col + base];
	C->elem[row * n + col + base] = acc;
}


/*
 * Test on MQDB kernels using Unified Memory.
 *
 * Builds three n x n mqdb matrices with k diagonal blocks in managed memory
 * and times three GPU products with the caller-supplied events:
 *   1. naive full matrix product;
 *   2. one block-product kernel per diagonal block, default stream;
 *   3. one block-product kernel per diagonal block, one stream per block.
 * All managed memory and streams are released before returning (the
 * original leaked every allocation on each call).
 */
void testKernelsMQDB_unified(uint n, uint k, cudaEvent_t start, cudaEvent_t stop) {

	// Every managed allocation is remembered here so it can be released at
	// the end regardless of what the mqdb helpers do with the descriptor fields.
	void *managed[9];
	int nManaged = 0;

	// matrix instance generation - Unified Memory
	mqdb *A, *B, *C;
	CHECK(cudaMallocManaged(&A, sizeof(mqdb)));
	managed[nManaged++] = A;
	CHECK(cudaMallocManaged(&A->blkSize, k*sizeof(int)));
	managed[nManaged++] = A->blkSize;
	CHECK(cudaMallocManaged(&A->elem, n*n*sizeof(float)));
	managed[nManaged++] = A->elem;

	CHECK(cudaMallocManaged(&B, sizeof(mqdb)));
	managed[nManaged++] = B;
	CHECK(cudaMallocManaged(&B->blkSize, k*sizeof(int)));
	managed[nManaged++] = B->blkSize;
	CHECK(cudaMallocManaged(&B->elem, n*n*sizeof(float)));
	managed[nManaged++] = B->elem;

	CHECK(cudaMallocManaged(&C, sizeof(mqdb)));
	managed[nManaged++] = C;
	CHECK(cudaMallocManaged(&C->blkSize, k*sizeof(int)));
	managed[nManaged++] = C->blkSize;
	CHECK(cudaMallocManaged(&C->elem, n*n*sizeof(float)));
	managed[nManaged++] = C->elem;

	// random fill mat entries
	int seed = 1;
	genRandDimsUnified(A, n, k, seed);
	genRandDimsUnified(B, n, k, seed);
	genRandDimsUnified(C, n, k, seed);
	fillBlocksUnified(A, n, k, 'C', 1);
	fillBlocksUnified(B, n, k, 'C', 2);
	fillBlocksUnified(C, n, k, 'C', 0);

	ulong nBytes = n * n * sizeof(float);
	printf("Memory size required = %3.4f (MB)\n",(float)nBytes/(1024.0*1024.0));


	/***********************************************************/
	/*                     GPU mat product                     */
	/***********************************************************/

	printf("Kernel (naive) mat product...\n");
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
	float milliseconds;
	CHECK(cudaEventRecord(start));
	matProdKernel<<<grid, block>>>(A, B, C, n);
	CHECK(cudaGetLastError());  // surface launch-configuration errors
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime1 = milliseconds / 1000.0;
	printf("   elapsed time               : %.4f (sec)\n", GPUtime1);
	//printf("   speedup vs CPU MQDB product: %.2f\n\n", CPUtime/GPUtime1);
	//mqdbDisplay(C);

	/***********************************************************/
	/*                     GPU MQDB product                    */
	/***********************************************************/

	printf("Kernel MQDB product...\n");
	uint sdim = 0;  // row/column where the current diagonal block starts
	CHECK(cudaEventRecord(start));
	for (uint i = 0; i < k; i++ ) {
		uint d = A->blkSize[i];
		// NOTE(review): this launch keeps the full n x n grid although only
		// d x d threads do work, while the streamed variant below sizes the
		// grid from d -- confirm this asymmetry is intended for the comparison.
		mqdbBlockProd<<<grid, block>>>(A, B, C, sdim, d, n);
		CHECK(cudaGetLastError());
		sdim += d;
	}
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime2 = milliseconds / 1000.0;
	printf("   elapsed time                  :  %.4f (sec)\n", GPUtime2);
	//printf("   speedup vs CPU MQDB product   :  %.2f\n", CPUtime/GPUtime2);
	printf("   speedup vs GPU std mat product:  %.4f\n\n", GPUtime1/GPUtime2);

	/***********************************************************/
	/*             GPU MQDB product using streams              */
	/***********************************************************/

	printf("Kernel MQDB product using streams...\n");

	// create one stream per diagonal block (heap array instead of a
	// non-standard variable-length array)
	int nstreams = A->nBlocks;
	cudaStream_t *streams = new cudaStream_t[nstreams];
	for (int i = 0; i < nstreams; i++)
		CHECK(cudaStreamCreate(&streams[i]));

	uint dsum = 0;  // row/column where the current diagonal block starts
	CHECK(cudaEventRecord(start));
	for (int i = 0; i < nstreams; i++) {
		uint d = A->blkSize[i];
		// grid sized for the d x d block only
		dim3 blkGrid((d + block.x - 1) / block.x, (d + block.y - 1) / block.y);
		mqdbBlockProd<<<blkGrid, block, 0, streams[i]>>>(A, B, C, dsum, d, n);
		CHECK(cudaGetLastError());
		dsum += d;
	}
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime3 = milliseconds / 1000.0;
	printf("   elapsed time                  : %.5f (sec)\n", GPUtime3);
	//printf("   speedup vs CPU MQDB product   : %.2f\n", CPUtime/GPUtime3);
	printf("   speedup vs GPU std mat product: %.2f\n",GPUtime1/GPUtime3);
	printf("   speedup vs GPU MQDB product   : %.2f\n",GPUtime2/GPUtime3);
	//mqdbDisplay(C);

	// clean up streams
	for (int i = 0; i < nstreams; i++)
		CHECK(cudaStreamDestroy(streams[i]));
	delete[] streams;

	// release managed memory in reverse order of allocation; the device was
	// synchronized above, so no kernel still uses it
	while (nManaged > 0)
		CHECK(cudaFree(managed[--nManaged]));

}

/******************************************************************************/
/*                                  MAIN																			*/
/******************************************************************************/

// Runs the unified-memory MQDB benchmark on device 0 for a fixed matrix size
// and a number of diagonal blocks k = min_k, min_k + 10, ..., max_k.
int main(int argc, char *argv[]) {

	// set up device
	int dev = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
	printf("%s starting mqdb product at ", argv[0]);
	printf("device %d: %s\n", dev, deviceProp.name);
	CHECK(cudaSetDevice(dev));

	// events to measure time (now error-checked like every other CUDA call)
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	uint n = 8*1024;         // matrix size
	uint min_k = 10;         // min num of blocks
	uint max_k = 100;        // max num of blocks

	// multiple tests for k = # diag blocks
	for (uint k = min_k; k <= max_k; k+=10) {
		printf("\n*****   k = %u --- (avg block size = %f)\n",k,(float)n/k);
		testKernelsMQDB_unified(n, k, start, stop);
	}

	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	return 0;
}




In [None]:
# Compilazione ed esecuzione
!nvcc -arch=sm_75  src/STREAMS/MQDB_stream_Unified.cu GPUcomputing/utils/MQDB/mqdb.cpp -o MQDBS
!./MQDBS

In [None]:
%%cuda_group_save --name "MQDB_stream_manual.cu" --group "STREAMS"

#include "../../GPUcomputing/utils/MQDB/mqdb.h"
#include "../../GPUcomputing/utils/common.h"

#define BLOCK_SIZE 16     // block size

/*
 * mqdbBlockProd: multiplies one d x d diagonal block of A by the matching
 * block of B and stores the result in the matching block of C.
 *
 *   A, B, C  matrices whose 'elem' array is indexed row-major with row length n
 *   sdim     row/column index at which the block starts (the host code in this
 *            file passes the running sum of the preceding block sizes)
 *   d        side of the block
 *   n        row length of the full matrices
 *
 * Expects a 2D launch: x indexes columns, y indexes rows. Each thread with
 * row < d and col < d produces one entry; all other threads do nothing.
 */
__global__ void mqdbBlockProd(mqdb A, mqdb B, mqdb C, uint sdim, uint d, uint n) {
	int col = blockIdx.x * blockDim.x + threadIdx.x;
	int row = blockIdx.y * blockDim.y + threadIdx.y;

	// threads that fall outside the d x d block have nothing to compute
	if (!((row < d) && (col < d)))
		return;

	// linear index of the block's top-left entry: sdim rows down, sdim columns right
	uint blockOrigin = n * sdim + sdim;

	uint rowBase = row * n + blockOrigin;   // start of this thread's row inside the block
	uint colBase = col + blockOrigin;       // top of this thread's column inside the block

	// dot product of the block row of A with the block column of B
	float acc = 0;
	int k = 0;
	while (k < d) {
		acc += A.elem[rowBase + k] * B.elem[k * n + colBase];
		k++;
	}

	C.elem[rowBase + col] = acc;
}

/*
 * Test of the MQDB block product with explicitly managed memory: page-locked
 * host buffers, separate device buffers and one CUDA stream per diagonal block.
 *
 * For every block i the stream performs, in order:
 *   1. async copy host->device of the stripe of rows of A and B holding block i
 *   2. the mqdbBlockProd kernel on that block
 *   3. async copy device->host of the same stripe of C
 *
 *   n      matrix side (matrices are n x n floats, row-major)
 *   k      number of diagonal blocks requested from genRandDims
 *   start  already-created event, recorded before the first stream is issued
 *   stop   already-created event, recorded after all streams have drained
 *
 * Prints the elapsed time and releases every buffer it allocated, so it can be
 * called repeatedly without accumulating pinned or device memory.
 */
void testKernelsMQDB_manual_mem(uint n, uint k, cudaEvent_t start, cudaEvent_t stop) {

	// matrices
	mqdb *A, *B, *C;         // host
	mqdb d_A, d_B, d_C;      // device

	// widen before multiplying so n*n cannot wrap in 32-bit arithmetic
	size_t nBytes = (size_t)n * n * sizeof(float);
	size_t kBytes = (size_t)k * sizeof(int);
	printf("Memory size required = %3.4f (MB)\n",(float)nBytes/(1024.0*1024.0));

	// host (page-locked, required for truly asynchronous copies) and device memory
	CHECK(cudaMallocHost(&A, sizeof(mqdb)));
	CHECK(cudaMallocHost(&A->blkSize, kBytes));
	CHECK(cudaMallocHost(&A->elem, nBytes));
	CHECK(cudaMalloc(&d_A.blkSize, kBytes));
	CHECK(cudaMalloc(&d_A.elem, nBytes));

	CHECK(cudaMallocHost(&B, sizeof(mqdb)));
	CHECK(cudaMallocHost(&B->blkSize, kBytes));
	CHECK(cudaMallocHost(&B->elem, nBytes));
	CHECK(cudaMalloc(&d_B.blkSize, kBytes));
	CHECK(cudaMalloc(&d_B.elem, nBytes));

	CHECK(cudaMallocHost(&C, sizeof(mqdb)));
	CHECK(cudaMallocHost(&C->blkSize, kBytes));
	CHECK(cudaMallocHost(&C->elem, nBytes));
	CHECK(cudaMalloc(&d_C.blkSize, kBytes));
	CHECK(cudaMalloc(&d_C.elem, nBytes));

	// Remember the host buffers allocated above: clean-up frees exactly these,
	// whatever the fill helpers below do with the struct fields.
	void *pinned[] = { A->blkSize, A->elem, B->blkSize, B->elem, C->blkSize, C->elem };
	const int nPinned = sizeof(pinned) / sizeof(pinned[0]);

	// random fill mat entries
	// NOTE(review): genRandDims is expected to set nBlocks and blkSize[] (both
	// are read below and set nowhere else in this function) -- confirm in mqdb.cpp
	int seed = 1;
	genRandDims(A, n, k, seed);
	genRandDims(B, n, k, seed);
	genRandDims(C, n, k, seed);
	fillBlocks(A, n, k, 'C', 1);
	fillBlocks(B, n, k, 'C', 2);
	fillBlocks(C, n, k, 'C', 0);

	// Block sizes are not uploaded: the kernel only dereferences the 'elem'
	// field of the device structs.

	// The stripes copied back below also cover entries outside the diagonal
	// blocks, which the kernel never writes; clear d_C so they come back as 0
	// instead of uninitialised device memory.
	CHECK(cudaMemset(d_C.elem, 0, nBytes));

	/***********************************************************/
	/*       GPU MQDB product using streams & async copy       */
	/***********************************************************/
	printf("GPU MQDB product using streams...\n");

	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	int nstreams = A->nBlocks;
	cudaStream_t streams[nstreams];
	uint dsum = 0;  // first row/column of the current block
	CHECK(cudaEventRecord(start));
	for (int i = 0; i < nstreams; i++) {
		CHECK(cudaStreamCreate(&streams[i]));
		uint d = A->blkSize[i];

		// stripe = the d full rows that contain block i
		size_t offset = (size_t)dsum * n;                    // in elements
		size_t streamBytes = (size_t)d * n * sizeof(float);  // in BYTES: cudaMemcpyAsync counts bytes

		CHECK(cudaMemcpyAsync(&d_A.elem[offset], &A->elem[offset], streamBytes, cudaMemcpyHostToDevice, streams[i]));
		CHECK(cudaMemcpyAsync(&d_B.elem[offset], &B->elem[offset], streamBytes, cudaMemcpyHostToDevice, streams[i]));

		dim3 grid((d + block.x - 1) / block.x, (d + block.y - 1) / block.y);
		mqdbBlockProd<<<grid, block, 0, streams[i]>>>(d_A, d_B, d_C, dsum, d, n);
		CHECK(cudaGetLastError());   // launch-configuration errors are not reported by <<<>>> itself

		CHECK(cudaMemcpyAsync(&C->elem[offset], &d_C.elem[offset], streamBytes, cudaMemcpyDeviceToHost, streams[i]));
		dsum += d;
	}
	// wait for every stream, so the stop event does not depend on the
	// default-stream mode and asynchronous kernel errors surface here
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime3 = milliseconds / 1000.0f;
	printf("   elapsed time                  : %.5f (sec)\n", GPUtime3);

	// clean up streams (the events belong to the caller)
	for (int i = 0; i < nstreams; i++) {
		CHECK(cudaStreamDestroy(streams[i]));
	}

	// release device memory
	CHECK(cudaFree(d_A.blkSize));
	CHECK(cudaFree(d_A.elem));
	CHECK(cudaFree(d_B.blkSize));
	CHECK(cudaFree(d_B.elem));
	CHECK(cudaFree(d_C.blkSize));
	CHECK(cudaFree(d_C.elem));

	// release page-locked host memory: buffers first, then the structs
	for (int i = 0; i < nPinned; i++) {
		CHECK(cudaFreeHost(pinned[i]));
	}
	CHECK(cudaFreeHost(A));
	CHECK(cudaFreeHost(B));
	CHECK(cudaFreeHost(C));
}

/******************************************************************************/
/*                                  MAIN																			*/
/******************************************************************************/

/*
 * Driver for the manual-memory MQDB stream test.
 *
 * Selects device 0, creates the two timing events and runs
 * testKernelsMQDB_manual_mem() for k = min_k, min_k+5, ..., max_k diagonal
 * blocks on an n x n matrix. Every CUDA runtime call is wrapped in CHECK so
 * that a failure is reported where it happens instead of surfacing later.
 */
int main(int argc, char *argv[]) {

	// set up device
	int dev = 0;
	cudaDeviceProp deviceProp;
	CHECK(cudaGetDeviceProperties(&deviceProp, dev));
	printf("%s starting mqdb product at ", argv[0]);
	printf("device %d: %s\n", dev, deviceProp.name);
	CHECK(cudaSetDevice(dev));

	// events to measure time (shared by all the runs below)
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	uint n = 16*1024;      // matrix size
	uint min_k = 20;       // min num of blocks
	uint max_k = 30;       // max num of blocks

	// multiple tests for k = # diag blocks
	for (uint k = min_k; k <= max_k; k += 5) {
		// k is unsigned, hence %u
		printf("\n*****   k = %u --- (avg block size = %f)\n", k, (float)n/k);
		testKernelsMQDB_manual_mem(n, k, start, stop);
	}

	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	return 0;
}





In [None]:
# Compilazione ed esecuzione
!nvcc -arch=sm_75  src/STREAMS/MQDB_stream_manual.cu GPUcomputing/utils/MQDB/mqdb.cpp -o MQDBS
!./MQDBS