---
# **LAB 10 - Parallel patterns**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ▶️ DeviceQuery

In [None]:
# DeviceQuery dell'attuale device (su Colab!)
!nvcc -arch=sm_75 /content/GPUcomputing/utils/deviceQuery.cu -o deviceQuery
!./deviceQuery

# ✅ Scan

In [None]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"
#include "../../GPUcomputing/utils/common.h"

#define BLOCK_SIZE 512


/**
 * Kernel: inclusive prefix sum (scan) performed independently by every block.
 *
 * Each block loads blockDim.x consecutive elements of input into shared
 * memory, scans them in place and writes the scanned values back to input.
 * The last thread of the block also stores the block total in
 * output[blockIdx.x].
 *
 * Preconditions (not checked here): blockDim.x <= BLOCK_SIZE and the input
 * length equals gridDim.x * blockDim.x, since there is no bounds guard.
 *
 * @param input   array scanned in place, one segment per block
 * @param output  one entry per block: the sum of that block's segment
 */
__global__ void block_scan(int *input, int *output) {
   __shared__ int smem[BLOCK_SIZE];
   int tid = threadIdx.x;                  // local index: 0:BLOCK_SIZE-1
   int i = tid + blockIdx.x * blockDim.x;  // global index 0:n-1

   // Load input into shared memory.
   smem[tid] = input[i];
   __syncthreads();

   // Doubling steps: at step d every element adds the one d positions before.
   for (int d = 1; d < BLOCK_SIZE; d *= 2) {
      // Read the partner value first: the partner slot is updated by another
      // thread during this same step, so read and write must be separated
      // by a barrier to avoid a data race.
      int addend = 0;
      if (tid >= d)
         addend = smem[tid - d];
      __syncthreads();

      if (tid >= d)
         smem[tid] += addend;
      __syncthreads();
   }

   int scanned = smem[tid];
   input[i] = scanned;

   // The last thread of the block holds the total of the whole segment.
   if (tid == blockDim.x - 1)
      output[blockIdx.x] = scanned;
}

/**
 * Kernel: turns per-block scans into a scan over the whole array.
 *
 * Every thread of block B (B > 0) accumulates output[0..B-1] and adds the
 * result to its own element of input. Block 0 leaves its elements untouched.
 *
 * @param input   array holding the per-block scans, updated in place
 * @param output  per-block values that are accumulated (read only here)
 */
__global__ void sum_block_scan(int *input, int *output) {
   int i = threadIdx.x + blockIdx.x * blockDim.x;  // global index 0:n-1

   // Nothing precedes the first block, so it has no offset to add.
   if (blockIdx.x != 0) {
      int offset = 0;
      int j = 0;
      while (j < blockIdx.x) {
         offset += output[j];
         ++j;
      }

      // shift this element by the sum of all preceding blocks
      input[i] += offset;
   }
}

/*
 * MAIN: test on parallel scan (inclusive prefix sum).
 *
 * Computes the scan of an all-ones vector on the CPU (reference, kept in b)
 * and on the GPU (block_scan followed by sum_block_scan, result copied back
 * into a), prints both timings and the speedup, then compares the results.
 */
int main(void) {
	int *a, *b, *d_a, *d_s;
	int blockSize = BLOCK_SIZE;      // block dim 1D
	ulong numBlock = 100*1024;       // grid dim 1D
	ulong n = blockSize * numBlock;  // array dim
	ulong nByte = n*sizeof(int);
	ulong mByte = numBlock*sizeof(int);

	printf("\n****  test on parallel scan  ****\n");
	printf("  Vector length: %lu\n", n);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// init mems
	a = (int *) malloc(nByte);
	b = (int *) malloc(nByte);
	if (a == NULL || b == NULL) {
		fprintf(stderr, "Host memory allocation failed\n");
		free(a);
		free(b);
		return 1;
	}
	CHECK(cudaMalloc((void **) &d_a, nByte));
	CHECK(cudaMalloc((void **) &d_s, mByte));

	/***********************************************************/
	/*                       CPU scan                          */
	/***********************************************************/

	// reference input
	for (ulong i = 0; i < n; i++) b[i] = 1;

	printf("\n  CPU procedure...\n");
	double go = seconds();
	for (ulong i = 1; i < n; i++)
		b[i] += b[i-1];
	double CPUtime = seconds() - go;
	printf("    Elapsed time: %f (sec) \n", CPUtime);

	/***********************************************************/
	/*                   KERNEL block scan                     */
	/***********************************************************/

	// GPU input
	for (ulong i = 0; i < n; i++) a[i] = 1;

	printf("\n  block scan...\n");
	CHECK(cudaMemcpy(d_a, a, nByte, cudaMemcpyHostToDevice));
	CHECK(cudaMemset(d_s, 0, mByte));

	CHECK(cudaEventRecord(start));
	block_scan<<< numBlock, blockSize >>>(d_a, d_s);
	CHECK(cudaGetLastError());   // launch-configuration errors show up here
	sum_block_scan<<< numBlock, blockSize >>>(d_a, d_s);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	double GPUtime = milliseconds / 1000.0;
	printf("   elapsed time:   %.5f (sec)\n", GPUtime);
	double speedup = CPUtime/GPUtime;
	printf("    Speedup %.1f\n", speedup);

	CHECK(cudaMemcpy(a, d_a, nByte, cudaMemcpyDeviceToHost));

	// compare the GPU scan against the CPU reference
	bool ok = true;
	for (ulong i = 0; i < n; i++) {
		if (a[i] != b[i]) {
			printf("    Scan error! gpu[%lu]=%d cpu[%lu]=%d\n", i, a[i], i, b[i]);
			ok = false;
			break;
		}
	}
	if (ok)
		printf("    Scan OK!\n");

	// release resources
	CHECK(cudaFree(d_a));
	CHECK(cudaFree(d_s));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	free(a);
	free(b);

	return 0;
}



# ✅ MergeSort

CPU mergeSort

In [None]:
%%cuda_group_save --name "mergesortCPU.cu" --group "ALG"

#include <stdlib.h>
#include <stdio.h>

/**
 * Merge two adjacent sorted runs of arr into one sorted run, in place.
 *
 * The runs are arr[l..m] and arr[m+1..r] (all bounds inclusive). Both runs
 * are copied into temporary buffers, merged back into arr[l..r] and the
 * buffers are released before returning.
 *
 * @param arr  array containing the two runs
 * @param l    first index of the left run
 * @param m    last index of the left run
 * @param r    last index of the right run
 */
void merge(int arr[], int l, int m, int r) {
	//	printf("merge: l = %d, m = %d, r = %d\n",l,m,r);
	int i, j, k;
	int n1 = m - l + 1;   // length of the left run
	int n2 = r - m;       // length of the right run

	// Create temp arrays
	int *L = new int[n1];
	int *R = new int[n2];

	// Copy data to temp arrays L[] and R[]
	for (i = 0; i < n1; i++)
		L[i] = arr[l + i];
	for (j = 0; j < n2; j++)
		R[j] = arr[m + 1 + j];

	// Merge the temp arrays back into arr[l..r]

	i = 0; // Initial index of first subarray
	j = 0; // Initial index of second subarray
	k = l; // Initial index of merged subarray

	while (i < n1 && j < n2) {
		// "<=" takes from the left run on ties, which keeps the merge stable
		if (L[i] <= R[j]) {
			arr[k] = L[i];
			i++;
		} else {
			arr[k] = R[j];
			j++;
		}
		k++;
	}

	// Copy the remaining elements of L[], if there are any
	while (i < n1) {
		arr[k] = L[i];
		i++;
		k++;
	}

	// Copy the remaining elements of R[], if there are any
	while (j < n2) {
		arr[k] = R[j];
		j++;
		k++;
	}

	// Release the temporaries (they used to leak on every call)
	delete[] L;
	delete[] R;
}

/**
 * Recursive merge sort of arr[l..r], both bounds inclusive.
 *
 * @param arr  array to sort in place
 * @param l    index of the first element of the sub-array
 * @param r    index of the last element of the sub-array
 */
void mergeSort(int arr[], int l, int r) {
	//	printf("mergeSort: l = %d, r = %d\n",l,r);
	if (l < r) {
		// Midpoint written so that l + r cannot overflow for large indices
		int m = l + (r - l) / 2;

		// Sort first and second halves
		mergeSort(arr, l, m);
		mergeSort(arr, m + 1, r);

		// merge in backtracking step
		merge(arr, l, m, r);
	}
}

/**
 * Fill an array with pseudo-random integers in the range [0, size).
 *
 * The generator is re-seeded on every call, so the same seed and size
 * always produce the same sequence.
 *
 * @param array  array that will be filled
 * @param size   number of elements (also the exclusive upper bound of values)
 * @param seed   value passed to srand()
 */
void random_array(int *array, unsigned size, int seed) {
	srand(seed);
	// unsigned counter: matches the type of size (no signed/unsigned compare)
	for (unsigned i = 0; i < size; i++) {
		array[i] = rand() % size;
	}
}

/**
 * Check that an array is sorted in non-decreasing order.
 *
 * Prints the first out-of-order pair found, or "Sorting OK!" when there is
 * none. Arrays with zero or one element are reported as sorted.
 *
 * @param array  array to check
 * @param size   number of elements
 */
void check_up_sorting(int array[], unsigned size) {
	bool flag = true;
	// "i + 1 < size" instead of "i < size - 1": size is unsigned, so
	// size - 1 would wrap around for an empty array and read out of bounds.
	for (unsigned i = 0; i + 1 < size; i++)
		if (array[i] > array[i + 1]) {
			printf("Sorting error! array[%u]=%d array[%u]=%d\n", i, array[i], i + 1,
					array[i + 1]);
			flag = false;
			break;
		}
	if (flag)
		printf("   Sorting OK!\n");
}

/*
 * Print the elements of an array separated by spaces.
 * When k > 0 a line break is inserted before every k-th element (except the
 * first one), so the output shows k values per row. Two newlines close the
 * output.
 */
void printArray(int arr[], int size, int k) {
	const bool wrapRows = k > 0;
	int idx = 0;
	while (idx < size) {
		// start a new row once k values have been printed
		const bool rowStart = wrapRows && idx != 0 && idx % k == 0;
		if (rowStart)
			printf("\n");
		printf("%d ", arr[idx]);
		++idx;
	}
	printf("\n\n");
}

/*
 * Copy the first size elements of src into dst.
 * Nothing is copied when size is zero or negative.
 */
void arrayCopy(int dst[], const int src[], const int size) {
	int idx = 0;
	while (idx < size) {
		dst[idx] = src[idx];
		++idx;
	}
}


GPU mergeSort

In [None]:
%%cuda_group_save --name "mergesortGPU.cu" --group "ALG"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "../../GPUcomputing/utils/common.h"

#define ARRAY_SIZE 64

void check_up_sorting(int *, unsigned);
void random_array(int *, unsigned, int);
void printArray(int *, int, int);
void mergeSort(int *, int, int);
void merge(int *, int, int, int);
void arrayCopy(int *, const int *, const int);

/**
 * Kernel: one merge pass of a bottom-up merge sort, one thread per chunk.
 *
 * The thread with global index t handles array[t*chunk .. t*chunk+chunk-1]:
 * it sequentially merges the two halves of that range into the same range
 * of sorted. Threads whose chunk would not fit inside n elements do nothing.
 *
 * @param array   source of this pass
 * @param sorted  destination of this pass
 * @param n       number of elements
 * @param chunk   length of the merged range (two halves of chunk/2 each)
 */
__global__ void cudaMergeSort(int *array, int *sorted, int n, int chunk) {

	const int first = chunk * (threadIdx.x + blockIdx.x * blockDim.x);
	if (first <= n - chunk) {
		const int mid = first + chunk / 2;
		const int last = first + chunk;
		int left = first;    // cursor in the lower half
		int right = mid;     // cursor in the upper half

		// Fill the output range one slot at a time. The lower half is
		// preferred on ties and whenever the upper half is exhausted.
		for (int out = first; out < last; ++out) {
			const bool takeLeft =
					left < mid && (right >= last || array[left] <= array[right]);
			if (takeLeft)
				sorted[out] = array[left++];
			else
				sorted[out] = array[right++];
		}
	}
}

/**
 * Iterative binary search over arr[0..k-1].
 *
 * Returns an index p in [0, k]:
 *  - UP true:  the search moves right while arr[m] <= x
 *  - UP false: the search moves right while arr[m] <  x
 * For an array sorted in non-decreasing order this is the position of the
 * first element greater than x (UP true) or not smaller than x (UP false).
 *
 * @param arr  array to search
 * @param x    value to locate
 * @param k    number of elements of arr
 * @param UP   selects how elements equal to x are treated (see above)
 */
__device__ int binarySearch(int arr[], int x, int k, bool UP) {
	int lo = 0;
	int hi = k;

	while (lo < hi) {
		const int mid = (lo + hi) / 2;
		// elements equal to x count as "before x" only in the UP case
		const bool goRight = UP ? (arr[mid] <= x) : (arr[mid] < x);
		if (goRight)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

/**
 * Kernel: one merge pass of a bottom-up merge sort with many threads per
 * chunk pair. Each thread places 2 elements: A[j] from the first chunk and
 * the corresponding B[j] from the second chunk of its pair.
 *
 * The final position of an element is its own offset inside its chunk plus
 * the number of elements of the other chunk that must precede it, found by
 * binary search. Ties are broken in favour of the first chunk, so the two
 * searches never yield the same slot.
 *
 * Expected launch: 1D grid with at least n/2 threads; extra threads exit.
 * The pairing assumes n is a multiple of 2*k — NOTE(review): the caller
 * uses powers of two for both.
 *
 * @param array   source of this pass (chunks of k sorted elements)
 * @param sorted  destination of this pass (chunks of 2k sorted elements)
 * @param n       number of elements
 * @param k       chunk length: 1,2,4,8,16,..., 2^m
 */
__global__ void cudaMergeSortMulti(int *array, int *sorted, int n, int k) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;

	// one thread per pair of elements: guard against an oversized grid
	if (tid >= n / 2)
		return;

	// For k == 1 these give j = 0, l = 2*tid, i = l, so no special case is
	// needed.
	int j = tid % k;      //# offset inside the chunk
	int l = (tid - j)*2;  //# first element of the first chunk
	int i = l + j;        //# A[i] first chunk   [][][*][] and  B[i+k] [][][*][]

	//# find the relative position of x within B[*]
	int x = array[i];
	int p = binarySearch(array+l+k, x, k, 1);
	sorted[i+p] = x;

	//# find the relative position of y within A[*]
	int y = array[i+k];
	p = binarySearch(array+l, y, k, 0);
	sorted[i+p] = y;
}

/*
 * MAIN: merge sort on CPU and on GPU (two kernels), with timings.
 *
 *  1. CPU recursive merge sort (reference timing)
 *  2. GPU, one thread per chunk (cudaMergeSort)
 *  3. GPU, many threads per chunk (cudaMergeSortMulti)
 *
 * The GPU versions ping-pong between two managed buffers: every pass reads
 * from src and writes to dst, then the two pointers are swapped, so after
 * the loop src always designates the buffer holding the latest result.
 */
int main(int argc, char** argv) {

	// Create the vector with the specified size and situation
	int *orig, *array, *sorted;
	int N = 4*1024*1024;         // must be a power of 2
	int BLOCK_SIZE = 32;

	printf("Sorting array size N = %d\n",N);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// managed memory
	CHECK(cudaMallocManaged((void **)&array, N * sizeof(int)));
	CHECK(cudaMallocManaged((void **)&sorted, N * sizeof(int)));

	// random instance
	orig = (int *) malloc(N * sizeof(int));
	if (orig == NULL) {
		fprintf(stderr, "Host memory allocation failed\n");
		return 1;
	}
	random_array(orig, N, 1);
	arrayCopy(array, orig, N);
	// printArray(array,N,16);

	/*****************************************************
	 *                      CPU                          *
	 *****************************************************/

	printf("** CPU processing...\n");
	double startTm = seconds();
	// mergeSort takes inclusive bounds: the last valid index is N - 1
	mergeSort(array, 0, N - 1);
	double CPUtime = seconds() - startTm;
	printf("   CPU elapsed time: %.5f (sec)\n", CPUtime);
	check_up_sorting(array, N);

	/*****************************************************
	 *              ONE THREAD x chunk                   *
	 *****************************************************/

	printf("\n** GPU ONE THREAD x chunk processing...\n");
	int *src = sorted;   // input of the next pass
	int *dst = array;    // output of the next pass
	arrayCopy(src, orig, N); // start from step 2
	CHECK(cudaEventRecord(start));
	for (int chunk = 2; chunk <= N; chunk *= 2) {
		int nThreads = N / chunk;
		dim3 block(min(nThreads, BLOCK_SIZE));
		dim3 grid((nThreads + block.x - 1) / block.x);

		cudaMergeSort<<<grid, block>>>(src, dst, N, chunk);
		CHECK(cudaGetLastError());

		// the output of this pass is the input of the next one
		int *swap = src;
		src = dst;
		dst = swap;
	}
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime = milliseconds / 1000.0;
	printf("   elapsed time:   %.5f (sec)\n", GPUtime);
	printf("   speedup vs CPU: %.2f\n", CPUtime / GPUtime);

	// src holds the result whatever the parity of the number of passes
	check_up_sorting(src, N);

	/*****************************************************
	 *              MULTI THREAD x chunk                 *
	 *****************************************************/

	printf("\n** GPU MULTI THREAD x chunk processing...\n");
	src = array;
	dst = sorted;
	arrayCopy(src, orig, N);

	// grid set up
	int nThreads = N/2;
	dim3 block(min(nThreads, BLOCK_SIZE));
	dim3 grid((nThreads + block.x - 1) / block.x);
	CHECK(cudaEventRecord(start));
	for (int chunk = 1; chunk <= N/2; chunk *= 2) {
		cudaMergeSortMulti<<<grid, block>>>(src, dst, N, chunk);
		CHECK(cudaGetLastError());

		int *swap = src;
		src = dst;
		dst = swap;
	}
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime1 = milliseconds / 1000.0;
	printf("   elapsed time:        %.5f (sec)\n", GPUtime1);
	printf("   speedup vs CPU:      %.2f\n", CPUtime / GPUtime1);
	printf("   speedup vs GPU mono: %.2f\n", GPUtime / GPUtime1);
	check_up_sorting(src, N);

	//	printArray(array,N,32);
	//	printArray(sorted,N,64);

	// release resources
	free(orig);
	CHECK(cudaFree(array));
	CHECK(cudaFree(sorted));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));

	return 0;

}

In [None]:
# compile & RUN
!nvcc -arch=sm_75 src/ALG/mergesortGPU.cu src/ALG/mergesortCPU.cu -o mergesort
!./mergesort

# ✅ BitonicSort


 * Gli indici $k$ e $j$ in input hanno significato:
   * $k = 2,4,8,...,2^s=N$
   * $j = k/2, k/4,...,1$ (parte dalla metà di $k$ e continua a dimezzare)
 * Gli operatori sui bit ^ (XOR) e & (AND) vengono usati per filtrare i thread:
   * $ixj = i \oplus j$ (nel codice `i ^ j`) aggiunge o toglie a $i$ una potenza di 2, cioè $ixj = i \pm j$ con $j = 2^a$
   * $(i \,\&\, k) == 0$ è vero sse il bit di peso $k$ dell'indice $i$ è zero (sort ascendente), altrimenti sort discendente
 * L'operazione $ixj > i$ significa: aggiorna solo quando l'indice $ixj$ fa un salto in avanti di $j=2^a$


In [None]:
%%cuda_group_save --name "bitonic.cu" --group "ALG"

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "../../GPUcomputing/utils/common.h"

#define THREADS 1024
#define BLOCKS 16*1024

/**
 * Kernel: one compare-exchange step of the bitonic sorting network.
 *
 * Thread i is paired with element ixj = i XOR j. Only the thread with the
 * lower index of each pair (ixj > i) does the work. The pair is ordered
 * ascending when (i & k) == 0 and descending otherwise.
 *
 * There is no bounds check: the launch is expected to provide exactly one
 * thread per array element.
 *
 * @param a  array being sorted in place
 * @param j  distance between the two partners (a power of 2)
 * @param k  size of the bitonic sequences built in the current round
 */
__global__ void bitonic_sort_step(int *a, int j, int k) {
	// Sorting partners i and ixj
	const unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
	const unsigned int ixj = i ^ j;   // XOR: adds or removes the power of 2 j

  #ifdef DEBUG
	if (i == 0)
		printf("ROUND: k = %d, j = %d\n", k, j);
  #endif

	// each pair is handled once, by its lower-indexed thread
	if (ixj <= i)
		return;

	const bool ascending = (i & k) == 0;
	const int lower = a[i];
	const int upper = a[ixj];

  #ifdef DEBUG
	if (ascending)
		printf("  UP  (ixj = %d\t    i = %d\t k = %d)   a[ixj] = %d - a[i] = %d\n", ixj, i, k, upper, lower);
	else
		printf("  DOWN  (ixj = %d\t    i = %d\t k = %d)   a[ixj] = %d - a[i] = %d\n", ixj, i, k, upper, lower);
  #endif

	// swap when the pair violates the requested direction
	const bool outOfOrder = ascending ? (lower > upper) : (lower < upper);
	if (outOfOrder) {
		a[i] = upper;
		a[ixj] = lower;
	}
}


/*
 Compare-exchange of a[i] and a[j] according to the direction dir.
 The two elements are interchanged when dir equals the outcome of the
 comparison a[i] > a[j], taken as 1 (true) or 0 (false). Hence dir = 1
 leaves the pair in ascending order, dir = 0 in descending order.
 */
void compAndSwap(int a[], int i, int j, int dir) {
	const int firstIsGreater = a[i] > a[j];   // 1 or 0
	if (dir != firstIsGreater)
		return;

	const int held = a[j];
	a[j] = a[i];
	a[i] = held;
}

/*
 Recursively merges the cnt elements starting at index low. Each element of
 the first half is compare-exchanged with its counterpart in the second
 half, then both halves are merged in turn. For a bitonic input sequence the
 result is ascending when dir = 1 and descending when dir = 0.
*/
void bitonicMerge(int a[], int low, int cnt, int dir) {
	// a sequence of at most one element needs no merging
	if (cnt <= 1)
		return;

	const int half = cnt / 2;
	const int mid = low + half;

	// pair every element of the first half with its counterpart
	for (int lo = low, hi = mid; lo < mid; ++lo, ++hi)
		compAndSwap(a, lo, hi, dir);

	bitonicMerge(a, low, half, dir);
	bitonicMerge(a, mid, half, dir);
}

/*
 Bitonic sort of the cnt elements starting at index low.
 The first half is sorted ascending and the second half descending, which
 yields a bitonic sequence; bitonicMerge then orders the whole range in the
 direction dir (1 = ascending, 0 = descending).
*/
void bitonicSort(int a[], int low, int cnt, int dir) {
	if (cnt <= 1)
		return;

	const int half = cnt / 2;

	// first half ascending (dir = 1)
	bitonicSort(a, low, half, 1);

	// second half descending (dir = 0)
	bitonicSort(a, low + half, half, 0);

	// merge the whole sequence in the requested direction
	bitonicMerge(a, low, cnt, dir);
}

/*
 * MAIN: test bitonic sort on CPU and GPU.
 *
 * The same random vector is sorted by the recursive CPU implementation (b)
 * and by the GPU kernel (a); both timings are printed and the two results
 * are compared element by element.
 */
int main(void) {
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	int N = THREADS*BLOCKS;
	// check
	if (!(N && !(N & (N - 1)))) {
		printf("ERROR: N must be power of 2 (N = %d)\n", N);
		exit(1);
	}
	size_t nBytes = N * sizeof(int);
	int *a = (int*) malloc(nBytes);
	int *b = (int*) malloc(nBytes);
	if (a == NULL || b == NULL) {
		printf("ERROR: host memory allocation failed\n");
		free(a);
		free(b);
		exit(1);
	}

	// fill data
	for (int i = 0; i < N; ++i) {
		a[i] =  rand() % 100;
		b[i] = a[i];
	}

	/*****************************************************
	 *                   bitonic CPU                     *
	 *****************************************************/

	double cpu_time = seconds();
	bitonicSort(b, 0, N, 1);   // 1 means sort in ascending order
	printf("CPU elapsed time: %.5f (sec)\n", seconds()-cpu_time);

	/*****************************************************
	 *                   bitonic GPU                     *
	 *****************************************************/

	// device mem copy
	int *d_a;
	CHECK(cudaMalloc((void**) &d_a, nBytes));
	CHECK(cudaMemcpy(d_a, a, nBytes, cudaMemcpyHostToDevice));

	// num of threads
	dim3 blocks(BLOCKS, 1);   // Number of blocks
	dim3 threads(THREADS, 1); // Number of threads

	// start computation
	CHECK(cudaEventRecord(start));
	int j, k;
	// external loop on comparators of size k
	for (k = 2; k <= N; k <<= 1) {
		// internal loop for comparator internal stages
		for (j = k >> 1; j > 0; j = j >> 1) {
			bitonic_sort_step<<<blocks, threads>>>(d_a, j, k);
			CHECK(cudaGetLastError());   // catch launch-configuration errors
		}
	}
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("GPU elapsed time: %.5f (sec)\n", milliseconds / 1000);

	// recover data
	CHECK(cudaMemcpy(a, d_a, nBytes, cudaMemcpyDeviceToHost));

	// print & check
	if (N < 100) {
		printf("GPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", a[i]);
		printf("\nCPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", b[i]);
	}
	else {
		bool same = true;
		for (int i = 0; i < N; ++i) {
			if (a[i] != b[i]) {
				printf("ERROR a[%d] != b[%d]  (a[i] = %d  -  b[i] = %d)\n", i,i, a[i],b[i]);
				same = false;
				break;
			}
		}
		if (same)
			printf("GPU and CPU results match\n");
	}

	// release resources
	CHECK(cudaFree(d_a));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	free(a);
	free(b);

	return 0;
}

In [None]:
# for debugging (small vectors)
#!nvcc -arch=sm_75 -DDEBUG src/ALG/bitonic.cu -o bitonic

# for test on big vectors
!nvcc -arch=sm_75 src/ALG/bitonic.cu -o bitonic
!./bitonic