<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2022_GoogleColabs/blob/main/Week13/CUDA_Streams_Breadth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Notebook environment setup: point /usr/local/cuda at the CUDA 10.1 toolkit
# and install the notebook extension used to compile CUDA cells.
# Work inside /usr/local so the relative name `cuda` below refers to /usr/local/cuda.
%cd /usr/local/
# Remove whatever `cuda` entry currently exists (link or directory) ...
!rm -rf cuda
# ... and replace it with a symlink to the CUDA 10.1 install.
# NOTE(review): assumes /usr/local/cuda-10.1 exists on this runtime - confirm.
!ln -s /usr/local/cuda-10.1 /usr/local/cuda
# Print the link's metadata so its target can be checked by eye.
!stat cuda
# Install the nvcc4jupyter plugin straight from its GitHub repository.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
# Load the extension; presumably this is what provides the %%cu cell magic used below.
%load_ext nvcc_plugin

/usr/local
  File: cuda -> /usr/local/cuda-10.1
  Size: 20        	Blocks: 0          IO Block: 4096   symbolic link
Device: 24h/36d	Inode: 3276803     Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2022-04-14 19:54:44.533536995 +0000
Modify: 2022-04-14 19:54:44.412537155 +0000
Change: 2022-04-14 19:54:44.412537155 +0000
 Birth: -
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-tcb47z4s
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-tcb47z4s
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=fe9353eb5c8dbdf7ab5ef9cbe8dc300b785310b77de414c5b12606f412fda46c
  Stored in directory: /tmp/pip-ephem-wheel-cache-7sjd3q8n/wheels/ca/33/8d/3c86eb85e97d

In [2]:
%%cu

#include <stdio.h>

#define N 1000000000
#define NSTREAM 4

// Reports a failed CUDA runtime call and passes the status through.
//
//   err - status code returned by a CUDA runtime call.
//   msg - short label for the call site, embedded in the diagnostic.
//
// On any status other than cudaSuccess, one line containing `msg` and the
// runtime's description of `err` is written to stderr. The function never
// aborts; it always returns `err` unchanged so the caller decides what to do.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	// Fast path: nothing to report.
	if (err == cudaSuccess) {
		return err;
	}

	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	return err;
}

// Busy-work kernel 1 of 4.
//
// Takes no arguments and touches no global or shared memory: each thread
// runs N iterations updating a thread-local double accumulator, which is
// discarded when the kernel returns.
//
// NOTE(review): because the accumulator is never stored anywhere, an
// optimizing compiler may be free to drop the loop entirely - confirm the
// kernel really does work before trusting timings built on it.
__global__ void kernel_1()
{
	// Loop-invariant term, computed once up front.
	const double tan_sq = tan(0.1) * tan(0.1);

	double acc = 0.0;
	int step = 0;
	while (step < N)
	{
		acc += acc + tan_sq;
		++step;
	}
}

// Busy-work kernel 2 of 4.
//
// Takes no arguments and touches no global or shared memory: each thread
// runs N iterations updating a thread-local double accumulator, which is
// discarded when the kernel returns.
//
// NOTE(review): because the accumulator is never stored anywhere, an
// optimizing compiler may be free to drop the loop entirely - confirm the
// kernel really does work before trusting timings built on it.
__global__ void kernel_2()
{
	// Loop-invariant term, computed once up front.
	const double tan_sq = tan(0.1) * tan(0.1);

	double acc = 0.0;
	int step = 0;
	while (step < N)
	{
		acc += acc + tan_sq;
		++step;
	}
}

// Busy-work kernel 3 of 4.
//
// Takes no arguments and touches no global or shared memory: each thread
// runs N iterations updating a thread-local double accumulator, which is
// discarded when the kernel returns.
//
// NOTE(review): because the accumulator is never stored anywhere, an
// optimizing compiler may be free to drop the loop entirely - confirm the
// kernel really does work before trusting timings built on it.
__global__ void kernel_3()
{
	// Loop-invariant term, computed once up front.
	const double tan_sq = tan(0.1) * tan(0.1);

	double acc = 0.0;
	int step = 0;
	while (step < N)
	{
		acc += acc + tan_sq;
		++step;
	}
}

// Busy-work kernel 4 of 4.
//
// Takes no arguments and touches no global or shared memory: each thread
// runs N iterations updating a thread-local double accumulator, which is
// discarded when the kernel returns.
//
// NOTE(review): because the accumulator is never stored anywhere, an
// optimizing compiler may be free to drop the loop entirely - confirm the
// kernel really does work before trusting timings built on it.
__global__ void kernel_4()
{
	// Loop-invariant term, computed once up front.
	const double tan_sq = tan(0.1) * tan(0.1);

	double acc = 0.0;
	int step = 0;
	while (step < N)
	{
		acc += acc + tan_sq;
		++step;
	}
}

/*
 * Breadth-first multi-stream launch demo.
 *
 * Creates NSTREAM streams, issues kernel_1..kernel_4 in breadth-first order
 * (kernel_1 into every stream, then kernel_2 into every stream, and so on),
 * and prints the time elapsed between two events recorded around the
 * launches.
 *
 * Every CUDA call, including each kernel launch, is checked. Failures are
 * reported on stderr by checkCudaErr; the timing line is only printed when
 * nothing failed, and cleanup always runs for whatever was created.
 *
 * Returns 0 when every CUDA call succeeded, 1 otherwise.
 */
int main()
{
	int failures = 0;        // number of CUDA calls that reported an error
	float elapsed_time = 0;  // milliseconds between the start and stop events

	// Launch geometry: the "big" case uses 4096 threads in blocks of 512,
	// otherwise a single thread in a single block.
	const int bigcase = 1;
	int isize = 1;
	int iblock = 1;
	if (bigcase == 1)
	{
		iblock = 512;
		isize = 1 << 12;
	}

	// Set up execution configuration. Round the block count up so a thread
	// count that is not a multiple of the block size is never truncated.
	dim3 block (iblock);
	dim3 grid  ((isize + iblock - 1) / iblock);

	// The stream count is a compile-time constant, so a plain array is
	// enough; no heap allocation that could fail or leak.
	const int n_streams = NSTREAM;
	cudaStream_t streams[n_streams];
	int n_created = 0;       // streams[0 .. n_created) are valid handles
	while (n_created < n_streams)
	{
		if (checkCudaErr(cudaStreamCreate(&streams[n_created]), "stream creation") != cudaSuccess)
		{
			++failures;
			break;
		}
		++n_created;
	}

	// Create events
	cudaEvent_t start, stop;
	const bool have_start =
		checkCudaErr(cudaEventCreate(&start), "event create (start)") == cudaSuccess;
	const bool have_stop =
		checkCudaErr(cudaEventCreate(&stop), "event create (stop)") == cudaSuccess;
	if (!have_start) ++failures;
	if (!have_stop)  ++failures;

	// Only run the timed section when every stream and both events exist;
	// launching into an uncreated stream would be invalid.
	if (failures == 0)
	{
		void (*kernels[])() = { kernel_1, kernel_2, kernel_3, kernel_4 };
		const char* launch_labels[] = {
			"kernel_1 launch", "kernel_2 launch", "kernel_3 launch", "kernel_4 launch"
		};
		const int n_kernels = (int)(sizeof(kernels) / sizeof(kernels[0]));

		// Record start event.
		// NOTE(review): both events go to stream 0. This assumes stream 0 is
		// the legacy default stream, which implicitly synchronizes with the
		// streams created above; if the build uses per-thread default
		// streams the stop event would not wait for the kernels - confirm.
		if (checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord(start)") != cudaSuccess)
			++failures;

		// Breadth-first issue order: outer loop over kernels, inner loop
		// over streams.
		for (int k = 0; k < n_kernels; ++k)
		{
			void (*kernel)() = kernels[k];
			for (int i = 0; i < n_streams; ++i)
			{
				kernel <<< grid, block, 0, streams[i] >>> ();
				// A launch does not return a status itself; launch errors
				// (bad configuration, no kernel image for the device, ...)
				// are only visible through cudaGetLastError().
				if (checkCudaErr(cudaGetLastError(), launch_labels[k]) != cudaSuccess)
					++failures;
			}
		}

		// Record stop event and wait for it
		if (checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord") != cudaSuccess)
			++failures;
		if (checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize") != cudaSuccess)
			++failures;

		// Calculate elapsed time
		if (checkCudaErr(cudaEventElapsedTime(&elapsed_time, start, stop), "cudaEventElapsedTime") != cudaSuccess)
			++failures;

		// Surface any error raised while the kernels were executing.
		if (checkCudaErr(cudaDeviceSynchronize(), "cudaDeviceSynchronize") != cudaSuccess)
			++failures;

		// A timing is only meaningful if nothing above went wrong.
		if (failures == 0)
			printf("Measured time for parallel execution = %.3fs\n", elapsed_time / 1000.0f);
	}

	// Release the streams that were actually created
	for (int i = 0; i < n_created; i++)
	{
		if (checkCudaErr(cudaStreamDestroy(streams[i]), "cudaStreamDestroy") != cudaSuccess)
			++failures;
	}

	// Destroy events
	if (have_start && checkCudaErr(cudaEventDestroy(start), "cudaEventDestroy(start)") != cudaSuccess)
		++failures;
	if (have_stop && checkCudaErr(cudaEventDestroy(stop), "cudaEventDestroy(stop)") != cudaSuccess)
		++failures;

	if (checkCudaErr(cudaDeviceReset(), "cudaDeviceReset") != cudaSuccess)
		++failures;

	return failures == 0 ? 0 : 1;
}

Measured time for parallel execution = 0.000s

