---
# **LAB 10 - Parallel patterns**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ✅ scan

↘️ **SOL...**

In [None]:
%%cuda_group_save --name "scan.cu" --group "lab10"
#include <stdlib.h>
#include <stdio.h>

#include "../../GPUcomputing/utils/common.h"

#define BLOCK_SIZE 32

/**
 * Inclusive prefix sum of BLOCK_SIZE integers, computed by one thread block
 * in shared memory with the step-doubling (Hillis-Steele) scheme.
 *
 * Launch layout: a single block of exactly BLOCK_SIZE threads; thread t owns
 * element t. Indexing uses threadIdx.x only and there is no bounds guard, so
 * both buffers must hold at least BLOCK_SIZE elements.
 *
 * @param input   device array to scan
 * @param output  device array receiving the inclusive prefix sums
 */
__global__ void block_scan(int *input, int *output) {
   __shared__ int smem[BLOCK_SIZE];
   int tid = threadIdx.x;

   // Load input into shared memory
   smem[tid] = input[tid];
   __syncthreads();

   // At step d every thread with tid >= d adds the value found d slots to
   // its left. The partner value is fetched into a register and a barrier is
   // crossed BEFORE any slot is updated: without it, a thread could read
   // smem[tid - d] while its owner is rewriting it in the same step.
   for (int d = 1; d < BLOCK_SIZE; d *= 2) {
      int addend = (tid >= d) ? smem[tid - d] : 0;
      __syncthreads();   // all reads of this step are done
      smem[tid] += addend;
      __syncthreads();   // all writes of this step are visible
   }

   // Write the result back to global memory
   output[tid] = smem[tid];
}


/**
 * Work-efficient block scan (up-sweep / down-sweep tree on shared memory).
 *
 * The tree computes an exclusive scan in shared memory; the final store
 * shifts it left by one slot and appends the saved total, so `output`
 * receives the inclusive prefix sums of `input`.
 *
 * Launch layout: a single block of exactly BLOCK_SIZE/2 threads; thread t
 * owns elements 2t and 2t+1. BLOCK_SIZE is used as a power of two by the
 * halving/doubling loops. Both buffers must hold BLOCK_SIZE elements.
 *
 * @param input   device array to scan
 * @param output  device array receiving the inclusive prefix sums
 */
__global__ void block_scan_we(int *input, int *output) {
   __shared__ int smem[BLOCK_SIZE];
   int tid = threadIdx.x;
   int base = tid * 2;   // first element of this thread's pair

   // load input into shared memory
   smem[base] = input[base];           // even-indexed element
   smem[base + 1] = input[base + 1];   // odd-indexed element

   int offset = 1;
   // build sum in place up the tree (on SMEM)
   for (int d = BLOCK_SIZE/2; d > 0; d >>= 1) {
      __syncthreads();
      if (tid < d) {
         int L = offset * (base + 1) - 1;
         int R = offset * (base + 2) - 1;
         smem[R] += smem[L];
      }
      offset *= 2;
   }

   // The root smem[BLOCK_SIZE - 1] was last written by thread 0 in the final
   // up-sweep step, but it is read below by the LAST thread: a barrier is
   // required between the two accesses.
   __syncthreads();

   // save & clear the last element
   int last_elem = 0;   // only meaningful in the last thread
   if (tid == BLOCK_SIZE/2 - 1) {
      last_elem = smem[BLOCK_SIZE - 1];
      smem[BLOCK_SIZE - 1] = 0;      // exclusive scan
   }

   // traverse down tree again to get the prefix sums
   for (int d = 1; d < BLOCK_SIZE; d *= 2) {
      offset >>= 1;
      __syncthreads();
      if (tid < d) {
         int L = offset * (base + 1) - 1;
         int R = offset * (base + 2) - 1;
         int t = smem[L];
         smem[L] = smem[R];
         smem[R] += t;
      }
   }
   __syncthreads();

   // write results to device memory: inclusive[i] = exclusive[i + 1], and the
   // very last slot gets the total saved before the root was cleared
   output[base] = smem[base + 1];
   output[base + 1] = (tid < BLOCK_SIZE/2 - 1) ? smem[base + 2] : last_elem;
}

// Compares a GPU scan result with the CPU reference and reports the first
// mismatch. Returns true when the two arrays agree on all n elements.
static bool scan_matches(const char *label, const int *got, const int *expected, int n) {
  for (int i = 0; i < n; i++) {
    if (got[i] != expected[i]) {
      printf("Error (%s): out[%d] = %d, expected %d\n", label, i, got[i], expected[i]);
      return false;
    }
  }
  return true;
}

// Host driver: scans a BLOCK_SIZE-element array of ones with both block
// kernels, times each launch with CUDA events and checks both results against
// a sequential inclusive scan. Returns 0 only when both kernels are correct.
int main() {
  const int N = BLOCK_SIZE;         // Number of elements
  int *in = new int[N];
  int *out = new int[N];
  int *out_cpu = new int[N];

  // Initialize input array
  for (int i = 0; i < N; i++)
    in[i] = 1;

  // cpu scan for verification (inclusive)
  out_cpu[0] = in[0];
  for (int i = 1; i < N; i++)
    out_cpu[i] = out_cpu[i-1] + in[i];

  // Allocate device memory
  int *d_out, *d_in;
  const int arraySize = N * sizeof(int);
  CHECK(cudaMalloc(&d_out, arraySize));
  CHECK(cudaMalloc(&d_in, arraySize));
  CHECK(cudaMemcpy(d_in, in, arraySize, cudaMemcpyHostToDevice));

  // events used to time the kernels
  cudaEvent_t start, stop;
  CHECK(cudaEventCreate(&start));
  CHECK(cudaEventCreate(&stop));

  /***********************************************************/
  /*                block_scan kernel                        */
  /***********************************************************/

  CHECK(cudaEventRecord(start));
  block_scan<<< 1, BLOCK_SIZE>>>(d_in, d_out);
  CHECK(cudaGetLastError());
  CHECK(cudaEventRecord(stop));
  CHECK(cudaEventSynchronize(stop));
  float elapsedTime = 0;
  CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
  printf(" block_scan elaps time: %f ms\n", elapsedTime);

  // verify this kernel before its output buffer is reused
  CHECK(cudaMemcpy(out, d_out, arraySize, cudaMemcpyDeviceToHost));
  bool ok = scan_matches("block_scan", out, out_cpu, N);
  CHECK(cudaMemset(d_out, 0, arraySize)); // clear output buffer

  /***********************************************************/
  /*               block_scan work eff. kernel               */
  /***********************************************************/

  CHECK(cudaEventRecord(start));
  block_scan_we<<< 1, BLOCK_SIZE/2>>>(d_in, d_out);
  CHECK(cudaGetLastError());
  CHECK(cudaEventRecord(stop));
  CHECK(cudaEventSynchronize(stop));
  float elapsedTime1 = 0;
  CHECK(cudaEventElapsedTime(&elapsedTime1, start, stop));
  printf(" block_scan w.e. elaps time: %f ms\n", elapsedTime1);

  // Copy result back to host and check it
  CHECK(cudaMemcpy(out, d_out, arraySize, cudaMemcpyDeviceToHost));
  ok = scan_matches("block_scan_we", out, out_cpu, N) && ok;

  // report success only when no mismatch was found
  if (ok)
    printf("Results are correct!\n");

  CHECK(cudaFree(d_out));
  CHECK(cudaFree(d_in));
  CHECK(cudaEventDestroy(start));
  CHECK(cudaEventDestroy(stop));
  delete[] in;
  delete[] out;
  delete[] out_cpu;

  return ok ? 0 : 1;
}

↘️ **TODO...**

In [None]:
#include <stdlib.h>
#include <stdio.h>

#define BLOCK_SIZE 32

/**
 * Inclusive prefix sum of BLOCK_SIZE integers, computed by one thread block
 * in shared memory with the step-doubling (Hillis-Steele) scheme.
 *
 * Launch layout: a single block of exactly BLOCK_SIZE threads; thread t owns
 * element t. Indexing uses threadIdx.x only and there is no bounds guard, so
 * both buffers must hold at least BLOCK_SIZE elements.
 *
 * @param input   device array to scan
 * @param output  device array receiving the inclusive prefix sums
 */
__global__ void block_scan(int *input, int *output) {
   __shared__ int smem[BLOCK_SIZE];
   int tid = threadIdx.x;

   // Load input into shared memory
   smem[tid] = input[tid];
   __syncthreads();

   // At step d every thread with tid >= d adds the value found d slots to
   // its left. The partner value is fetched into a register and a barrier is
   // crossed BEFORE any slot is updated: without it, a thread could read
   // smem[tid - d] while its owner is rewriting it in the same step.
   for (int d = 1; d < BLOCK_SIZE; d *= 2) {
      int addend = (tid >= d) ? smem[tid - d] : 0;
      __syncthreads();   // all reads of this step are done
      smem[tid] += addend;
      __syncthreads();   // all writes of this step are visible
   }

   // Write the result back to global memory
   output[tid] = smem[tid];
}


/**
 * Exercise placeholder for the work-efficient block scan: the body is empty
 * and only lists the steps to implement.
 *
 * The host code in this cell launches it as <<<1, BLOCK_SIZE/2>>> and compares
 * `output` with an inclusive CPU scan of `input`, so each thread is expected
 * to handle two elements and the result must be the inclusive prefix sums.
 *
 * @param input   device array to scan (BLOCK_SIZE elements)
 * @param output  device array that must receive the prefix sums
 */
__global__ void block_scan_we(int *input, int *output) {

   // load input into shared memory

   // build sum in place up the tree (on SMEM)

   // save & clear the last element

   // traverse down tree again to get the prefix sums

   // write results to device memory

}

// Host driver: scans a BLOCK_SIZE-element array of ones with both block
// kernels, times the CPU reference and each launch, and checks the result of
// the work-efficient kernel against the sequential inclusive scan.
// NOTE(review): seconds() is not declared in this cell; the other cells of the
// notebook get it from GPUcomputing/utils/common.h - confirm the include.
int main() {
  const int N = BLOCK_SIZE;         // Number of elements
  int *in = new int[N];
  int *out = new int[N];
  int *out_cpu = new int[N];

  // Initialize input array
  for (int i = 0; i < N; i++)
    in[i] = 1;

  // cpu scan for verification. The host timer has its own name: it used to be
  // called `start`, which clashed with the cudaEvent_t declared further down.
  double startCPU = seconds();
  out_cpu[0] = in[0];
  for (int i = 1; i < N; i++)
    out_cpu[i] = out_cpu[i-1] + in[i];
  double stopCPU = seconds() - startCPU;
  printf("   Host elapsed time: %f\n", stopCPU);

  // Allocate device memory
  int *d_out, *d_in;
  const int arraySize = N * sizeof(int);
  cudaMalloc(&d_out, arraySize);
  cudaMalloc(&d_in, arraySize);
  cudaMemcpy(d_in, in, arraySize, cudaMemcpyHostToDevice);

  // events used to time the kernels
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  /***********************************************************/
  /*                block_scan kernel                        */
  /***********************************************************/

  cudaEventRecord(start);
  block_scan<<< 1, BLOCK_SIZE>>>(d_in, d_out);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float elapsedTime = 0;
  cudaEventElapsedTime(&elapsedTime, start, stop);
  printf(" block_scan elaps time: %f ms\n", elapsedTime);
  cudaMemset(d_out, 0, arraySize); // clear output buffer

  /***********************************************************/
  /*               block_scan work eff. kernel               */
  /***********************************************************/

  cudaEventRecord(start);
  block_scan_we<<< 1, BLOCK_SIZE/2>>>(d_in, d_out);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float elapsedTime1 = 0;
  cudaEventElapsedTime(&elapsedTime1, start, stop);
  printf(" block_scan w.e. elaps time: %f ms\n", elapsedTime1);

  // Copy result back to host
  cudaMemcpy(out, d_out, arraySize, cudaMemcpyDeviceToHost);

  // check results: the success message is printed only if nothing mismatched
  bool ok = true;
  for (int i = 0; i < N; i++) {
    if (out[i] != out_cpu[i]) {
      printf("Error: out[%d] = %d, expected %d\n", i, out[i], out_cpu[i]);
      ok = false;
      break;
    }
  }
  if (ok)
    printf("Results are correct!\n");

  cudaFree(d_out);
  cudaFree(d_in);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  delete[] in;
  delete[] out;
  delete[] out_cpu;

  return 0;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lab10/scan.cu -o scan
!./scan

# ✅ Luby coloring


↘️ **SOL...**

In [None]:
%%cuda_group_save --name "luby.cu" --group "lab10"
#include <iostream>
#include "coloring.h"

#define BLOCK 128

using namespace std;

/**
 * One coloring round: every still-uncolored node that is a local maximum of
 * `weights` among its competing neighbors (ties broken by the larger node id)
 * takes the color col->numOfColors. Nodes that lose the comparison set
 * col->uncoloredNodes so the host schedules another round.
 *
 * Launch layout: 1D grid, one thread per node, at least str->nodeSize threads.
 *
 * @param col      coloring state (coloring[], numOfColors, uncoloredNodes)
 * @param str      graph in CSR-like form (cumDegs offsets into neighs)
 * @param weights  per-node random priorities
 */
__global__ void findIS (Coloring* col, GraphStruct *str, uint* weights) {
	uint idx = threadIdx.x + blockDim.x * blockIdx.x;

	if (idx >= str->nodeSize)
		return;

	// colored in a previous round: nothing to do
	if (col->coloring[idx])
		return;

	const uint currentColor = col->numOfColors;
	const uint myWeight = weights[idx];
	uint offset = str->cumDegs[idx];
	uint deg = str->cumDegs[idx + 1] - offset;

	bool candidate = true;
	for (uint j = 0; j < deg && candidate; j++) {
		uint neighID = str->neighs[offset + j];
		uint neighColor = col->coloring[neighID];
		uint neighWeight = weights[neighID];

		// A neighbor competes if it was uncolored when the round started.
		// It may already carry currentColor because its thread ran first in
		// this same launch: it must still be counted, otherwise two adjacent
		// nodes could both pick currentColor.
		bool competing = (neighColor == 0) || (neighColor == currentColor);
		if (competing &&
				((myWeight < neighWeight) ||
				((myWeight == neighWeight) && idx < neighID))) {
			candidate = false;
		}
	}
	if (candidate) {
		col->coloring[idx] = currentColor;
	}
	else
		col->uncoloredNodes = true;
}

/**
 * Seeds one cuRAND state per element and stores a pseudo-random weight in
 * numbers[idx].
 *
 * Launch layout: 1D grid with at least n threads; surplus threads exit.
 *
 * @param seed     seed shared by all sequences
 * @param states   device array of n generator states (written)
 * @param numbers  device array of n weights (written)
 * @param n        number of elements
 */
__global__ void init (uint seed, curandState_t* states, uint* numbers, uint n) {
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;
	// valid indices are 0..n-1: the former test `idx > n` let thread n
	// write one element past the end of both arrays
	if (idx >= n)
		return;
	curand_init(seed, idx, 0, &states[idx]);
	// by operator precedence this is (curand % n) * n, i.e. a multiple of n
	// NOTE(review): confirm that `% (n*n)` was not the intended range
	numbers[idx] = curand(&states[idx])%n*n;
}

/**
 * Prints a summary of the graph and of its coloring; with `verbose` set it
 * also lists, for every color 1..numOfColors, the nodes holding that color.
 * @param col      coloring to print
 * @param str      graph structure (node and edge counts)
 * @param verbose  print the per-color node lists as well
 */
void printColoring (Coloring* col, GraphStruct* str, bool verbose) {
	node n = str->nodeSize;
	cout << "** Graph (num node: " << n << ", num edges: " << str->edgeSize << ")" << endl;
	cout << "** Coloring (num colors: " << col->numOfColors << ")" << endl;

	// summary only
	if (!verbose)
		return;

	for (int color = 1; color <= col->numOfColors; color++) {
		cout << "   color(" << color << ")" << "-> ";
		for (int v = 0; v < n; v++) {
			if (col->coloring[v] != color)
				continue;
			cout << v << " ";
		}
		cout << "\n";
	}
	cout << "\n";
}

/**
 * MAIN: builds a random graph, colors it on the GPU by repeatedly extracting
 * an independent set with findIS, then prints timing and number of colors.
 */
int main(void) {
	unsigned int n = 1000;		 // number of nodes for random graphs
	float prob = .5;				    // density (percentage) for random graphs
	std::default_random_engine eng{0};  // fixed seed

	// new graph with n nodes
	Graph graph(n,1);

	// generate a random graph
	graph.randGraph(prob,eng);

	// get the graph struct
	GraphStruct *str = graph.getStruct();
	cout << "** Graph (num node: " << n << ", num edges: " << str->edgeSize << ")" << endl;

	// print small graph
	if (n <= 20) {
		graph.print(true);  // CPU print
		print_d<<< 1, 1 >>>(str, true);  // GPU print
	}

	Coloring* col;
	CHECK(cudaMallocManaged(&col, sizeof(Coloring)));
	col->uncoloredNodes = true;

	// managed array of per-node colors; 0 means "not colored yet"
	CHECK(cudaMallocManaged( &(col->coloring), n * sizeof(uint)));
	// memset counts BYTES: clear the whole array, not just its first n bytes
	memset(col->coloring, 0, n * sizeof(uint));

	// allocate space on the GPU for the random states
	curandState_t* states;
	uint* weigths;
	CHECK(cudaMalloc((void**) &states, n * sizeof(curandState_t)));
	CHECK(cudaMalloc((void**) &weigths, n * sizeof(uint)));
	dim3 threads (BLOCK);
	dim3 blocks ((str->nodeSize + threads.x - 1) / threads.x, 1, 1 );
	uint seed = 0;
	init <<< blocks, threads >>> (seed, states, weigths, n);
	CHECK(cudaGetLastError());
	// weights must be ready, and no kernel may be running, before the host
	// touches the managed Coloring again
	CHECK(cudaDeviceSynchronize());

	// start timer for kernel execution
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));
	CHECK(cudaEventRecord(start));

	// loop on ISs covering the graph: one color per round
	col->numOfColors = 0;
	while (col->uncoloredNodes) {
		col->uncoloredNodes = false;
		col->numOfColors++;
		findIS <<< blocks, threads >>> (col, str, weigths);
		// the host reads col->uncoloredNodes in the loop condition
		CHECK(cudaDeviceSynchronize());
	}
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float elapsedTime = 0;
	CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
	cout << "   elaps time: " << elapsedTime/1000.0f << " (sec)" << endl;

	// print coloring
	cout << "   num colors: " << col->numOfColors << endl;
	if (n <= 20) {
		printColoring(col, str, 1);
	}

	CHECK(cudaFree(states));
	CHECK(cudaFree(weigths));
	CHECK(cudaFree(col->coloring));
	CHECK(cudaFree(col));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));

	return EXIT_SUCCESS;
}

↘️ **TODO...**

TEST on Luby


Graph print layout:
```
** Graph (num node: 10, num edges: 46)
      (min deg: 3, max deg: 7, mean deg: 4.6, connected: 1)
      node(0)[5]-> 1 2 3 7 9
      node(1)[3]-> 0 2 9
      node(2)[5]-> 0 1 3 5 8
      node(3)[7]-> 0 2 4 5 6 7 9
      node(4)[5]-> 3 6 7 8 9
      node(5)[5]-> 2 3 6 8 9
      node(6)[4]-> 3 4 5 8
      node(7)[3]-> 0 3 4
      node(8)[4]-> 2 4 5 6
      node(9)[5]-> 0 1 3 4 5
```


Coloring print layout:
```
** Graph (num node: 10, num edges: 36)
** Coloring (num colors: 6)
    color(1)-> 1 3 8
    color(2)-> 0 6
    color(3)-> 4 7
    color(4)-> 9
    color(5)-> 5
    color(6)-> 2
```




In [None]:
%%cuda_group_save --name "luby.cu" --group "lab10"
#include <iostream>
#include "coloring.h"

#define BLOCK 128

using namespace std;

/**
 * Exercise placeholder: find an independent set (IS) among the nodes that are
 * still uncolored. The body is left to implement.
 *
 * @param col      coloring state shared between host and device
 * @param str      graph structure
 * @param weights  per-node random values filled by the init kernel
 */
__global__ void findIS (Coloring* col, GraphStruct *str, uint* weights) {

	// TODO

}

/**
 * Seeds one cuRAND state per element and stores a pseudo-random weight in
 * numbers[idx].
 *
 * Launch layout: 1D grid with at least n threads; surplus threads exit.
 *
 * @param seed     seed shared by all sequences
 * @param states   device array of n generator states (written)
 * @param numbers  device array of n weights (written)
 * @param n        number of elements
 */
__global__ void init (uint seed, curandState_t* states, uint* numbers, uint n) {
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;
	// valid indices are 0..n-1: the former test `idx > n` let thread n
	// write one element past the end of both arrays
	if (idx >= n)
		return;
	curand_init(seed, idx, 0, &states[idx]);
	// by operator precedence this is (curand % n) * n, i.e. a multiple of n
	// NOTE(review): confirm that `% (n*n)` was not the intended range
	numbers[idx] = curand(&states[idx])%n*n;
}

/**
 * Prints a summary of the graph and of its coloring; with `verbose` set it
 * also lists, for every color 1..numOfColors, the nodes holding that color.
 * @param col      coloring to print
 * @param str      graph structure (node and edge counts)
 * @param verbose  print the per-color node lists as well
 */
void printColoring (Coloring* col, GraphStruct* str, bool verbose) {
	node n = str->nodeSize;
	cout << "** Graph (num node: " << n << ", num edges: " << str->edgeSize << ")" << endl;
	cout << "** Coloring (num colors: " << col->numOfColors << ")" << endl;

	// summary only
	if (!verbose)
		return;

	for (int color = 1; color <= col->numOfColors; color++) {
		cout << "   color(" << color << ")" << "-> ";
		for (int v = 0; v < n; v++) {
			if (col->coloring[v] != color)
				continue;
			cout << v << " ";
		}
		cout << "\n";
	}
	cout << "\n";
}

/**
 * MAIN (exercise skeleton): builds a random graph and prepares the coloring
 * state and the random weights; the coloring loop itself is left as a TODO.
 */
int main(void) {
	unsigned int n = 1000;		 // number of nodes for random graphs
	float prob = .5;				    // density (percentage) for random graphs
	std::default_random_engine eng{0};  // fixed seed

	// new graph with n nodes
	Graph graph(n,1);

	// generate a random graph
	graph.randGraph(prob,eng);

	// get the graph struct
	GraphStruct *str = graph.getStruct();
	cout << "** Graph (num node: " << n << ", num edges: " << str->edgeSize << ")" << endl;

	// print small graph
	if (n <= 20) {
		graph.print(true);  // CPU print
		print_d<<< 1, 1 >>>(str, true);  // GPU print
	}

	Coloring* col;
	CHECK(cudaMallocManaged(&col, sizeof(Coloring)));
	col->uncoloredNodes = true;
	// defined value for the report below even while the TODO is empty
	col->numOfColors = 0;

	// managed array of per-node colors; 0 means "not colored yet"
	CHECK(cudaMallocManaged( &(col->coloring), n * sizeof(uint)));
	// memset counts BYTES: clear the whole array, not just its first n bytes
	memset(col->coloring, 0, n * sizeof(uint));

	// allocate space on the GPU for the random states
	curandState_t* states;
	uint* weigths;
	CHECK(cudaMalloc((void**) &states, n * sizeof(curandState_t)));
	CHECK(cudaMalloc((void**) &weigths, n * sizeof(uint)));
	dim3 threads (BLOCK);
	dim3 blocks ((str->nodeSize + threads.x - 1) / threads.x, 1, 1 );
	uint seed = 0;
	init <<< blocks, threads >>> (seed, states, weigths, n);
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());

	// start timer for kernel execution
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));
	CHECK(cudaEventRecord(start));

	// loop on ISs covering the graph


	// TODO

	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float elapsedTime = 0;
	CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
	cout << "   elaps time: " << elapsedTime/1000.0f << " (sec)" << endl;

	// print coloring
	cout << "   num colors: " << col->numOfColors << endl;
	if (n <= 20) {
		printColoring(col, str, 1);
	}

	CHECK(cudaFree(states));
	CHECK(cudaFree(weigths));
	CHECK(cudaFree(col->coloring));
	CHECK(cudaFree(col));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));

	return EXIT_SUCCESS;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lab10/luby.cu -o luby -I GPUcomputing/lab10 GPUcomputing/utils/graph/graph.cpp GPUcomputing/utils/graph/graph_d.cu
!./luby

# ✅ MergeSort

↘️ **SOL...**

In [None]:
%%cuda_group_save --name "merge_sort.cu" --group "lab10"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "../../GPUcomputing/utils/common.h"

#define ARRAY_SIZE 64

void check_up_sorting(int *, unsigned);
void random_array(int *, unsigned, int);
void printArray(int *, int, int);
void mergeSort(int *, int, int);
void merge(int *, int, int, int);
void arrayCopy(int *, const int *, const int);

/**
 * Kernel: one merge pass of bottom-up merge sort, one thread per chunk.
 * Each thread sequentially merges the two sorted halves of
 * array[first .. first+chunk) into the same range of `sorted`.
 *
 * Launch layout: 1D grid with at least n/chunk threads; threads whose chunk
 * would not fit inside the n elements exit immediately.
 *
 * @param array   input, already sorted in runs of chunk/2 elements
 * @param sorted  output, sorted in runs of chunk elements
 * @param n       number of elements
 * @param chunk   length of the merged runs
 */
__global__ void cudaMergeSort(int *array, int *sorted, int n, int chunk) {

	const int first = chunk * (threadIdx.x + blockIdx.x * blockDim.x);
	if (first > n - chunk)
		return;

	const int half = first + chunk / 2;   // start of the second run
	const int last = first + chunk;       // one past the end of the chunk
	int left = first;
	int right = half;

	// One output slot per step: take from the left run when the right one is
	// exhausted, or when both have data and the left head is not larger.
	for (int out = first; out < last; ++out) {
		const bool takeLeft = (right >= last) ||
				(left < half && array[left] <= array[right]);
		sorted[out] = takeLeft ? array[left++] : array[right++];
	}
}

/**
 * Iterative binary search over the sorted range arr[0..k-1].
 * With UP set it returns the index of the first element greater than x
 * (number of elements <= x); otherwise the index of the first element not
 * smaller than x (number of elements < x). Returns k when no such element
 * exists.
 */
__device__ int binarySearch(int arr[], int x, int k, bool UP) {
	int lo = 0;
	int hi = k;

	while (lo < hi) {
		const int mid = (lo + hi) / 2;
		// UP (upper chunk B): equal elements are skipped too;
		// otherwise (lower chunk A): only strictly smaller ones
		const bool goRight = UP ? (arr[mid] <= x) : (arr[mid] < x);
		if (goRight)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

/**
 * Kernel: one merge pass with one thread per PAIR of elements. Thread tid
 * handles element j of the first run (A) and element j of the second run (B)
 * of its 2k-wide block, and places each one at its rank in the merged output,
 * found with a binary search in the opposite run.
 *
 * Launch layout: 1D grid with at least n/2 threads; surplus threads exit.
 * Precondition: n is a multiple of 2*k and every k-run of `array` is sorted.
 *
 * @param array   input, sorted in runs of k elements
 * @param sorted  output, sorted in runs of 2k elements
 * @param n       number of elements
 * @param k       run length of this pass (1, 2, 4, ..., n/2)
 */
__global__ void cudaMergeSortMulti(int *array, int *sorted, int n, int k) {
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	// one thread per pair: keep threads of a rounded-up grid inside the array
	if (tid >= n / 2)
		return;

	int j = tid % k;        // position inside the run
	int l = (tid - j)*2;    // first element of the first run
	int i = l + j;          // A[j] is array[i], B[j] is array[i+k]
	// for k == 1 these formulas already give j = 0 and l = i = 2*tid

	// rank of x = j + number of B elements <= x
	int x = array[i];
	int p = binarySearch(array+l+k, x, k, 1);
	sorted[i+p] = x;

	// rank of y = j + number of A elements < y (equal keys from A go after B's,
	// so the two searches never choose the same slot)
	int y = array[i+k];
	p = binarySearch(array+l, y, k, 0);
	sorted[i+p] = y;
}

/*
 * MAIN: sorts the same random array with the CPU merge sort, with the
 * one-thread-per-chunk kernel and with the one-thread-per-pair kernel,
 * printing elapsed times and speedups and checking every result.
 */
int main(int argc, char** argv) {

	// Create the vector with the specified size and situation
	int *orig, *array, *sorted;
	int N = 4*1024*1024;         // must be a power of 2
	int BLOCK_SIZE = 32;

	printf("Sorting array size N = %d\n",N);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// managed memory
	CHECK(cudaMallocManaged((void **)&array, N * sizeof(int)));
	CHECK(cudaMallocManaged((void **)&sorted, N * sizeof(int)));

	// random instance
	orig = (int *) malloc(N * sizeof(int));
	if (orig == NULL) {
		fprintf(stderr, "Error: cannot allocate %d host elements\n", N);
		return EXIT_FAILURE;
	}
	random_array(orig, N, 1);
	arrayCopy(array, orig, N);
	// printArray(array,N,16);

	/*****************************************************
	 *                      CPU                          *
	 *****************************************************/

	printf("** CPU processing...\n");
	double startTm = seconds();
	mergeSort(array, 0, N);
	double CPUtime = seconds() - startTm;
	printf("   CPU elapsed time: %.5f (sec)\n", CPUtime);
	check_up_sorting(array, N);

	/*****************************************************
	 *              ONE THREAD x chunk                   *
	 *****************************************************/

	printf("\n** GPU ONE THREAD x chunk processing...\n");
	arrayCopy(sorted, orig, N); // start from step 2
	bool array2sorted = false;
	CHECK(cudaEventRecord(start));
	for (int chunk = 2; chunk <= N; chunk *= 2) {
		int nThreads = N / chunk;
		dim3 block(min(nThreads, BLOCK_SIZE));
		dim3 grid((nThreads + block.x - 1) / block.x);

		// the two buffers swap roles at every pass
		if (array2sorted)
			cudaMergeSort<<<grid, block>>>(array, sorted, N, chunk);
		else
			cudaMergeSort<<<grid, block>>>(sorted, array, N, chunk);
		array2sorted = !array2sorted;
	}
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime = milliseconds / 1000.0;
	printf("   elapsed time:   %.5f (sec)\n", GPUtime);
	printf("   speedup vs CPU: %.2f\n", CPUtime / GPUtime);

	// The flag was flipped after the last pass: when it is true that pass
	// wrote into `array`, otherwise into `sorted`. Choosing the buffer from
	// the flag keeps the check right for any power-of-two N.
	check_up_sorting(array2sorted ? array : sorted, N);

	/*****************************************************
	 *              MULTI THREAD x chunk                 *
	 *****************************************************/
	printf("\n** GPU MULTI THREAD x chunk processing...\n");
	arrayCopy(array, orig, N);
	array2sorted = false;

	// grid set up
	int nThreads = N/2;
	dim3 block(min(nThreads, BLOCK_SIZE));
	dim3 grid((nThreads + block.x - 1) / block.x);
	CHECK(cudaEventRecord(start));
	for (int chunk = 1; chunk <= N/2; chunk *= 2) {
		array2sorted = !array2sorted;
		if (array2sorted)
			cudaMergeSortMulti<<<grid, block>>>(array, sorted, N, chunk);
		else
			cudaMergeSortMulti<<<grid, block>>>(sorted, array, N, chunk);
	}
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime1 = milliseconds / 1000.0;
	printf("   elapsed time:        %.5f (sec)\n", GPUtime1);
	printf("   speedup vs CPU:      %.2f\n", CPUtime / GPUtime1);
	printf("   speedup vs GPU mono: %.2f\n", GPUtime / GPUtime1);
	// here the flag is flipped BEFORE each pass: false means the last pass
	// wrote into `array`, so the names are swapped before checking
	if (!array2sorted) {
		int *swap = sorted;
		sorted = array;
		array = swap;
	}
	check_up_sorting(sorted, N);

//	printArray(array,N,32);
//	printArray(sorted,N,64);

	// release everything acquired above
	CHECK(cudaFree(array));
	CHECK(cudaFree(sorted));
	free(orig);
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));

	return 0;
}

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "merge_sort.cu" --group "lab10"
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "../../GPUcomputing/utils/common.h"

#define ARRAY_SIZE 64

void check_up_sorting(int *, unsigned);
void random_array(int *, unsigned, int);
void printArray(int *, int, int);
void mergeSort(int *, int, int);
void merge(int *, int, int, int);
void arrayCopy(int *, const int *, const int);

/**
 * Kernel: one merge pass of bottom-up merge sort, one thread per chunk.
 * Each thread sequentially merges the two sorted halves of
 * array[first .. first+chunk) into the same range of `sorted`.
 *
 * Launch layout: 1D grid with at least n/chunk threads; threads whose chunk
 * would not fit inside the n elements exit immediately.
 *
 * @param array   input, already sorted in runs of chunk/2 elements
 * @param sorted  output, sorted in runs of chunk elements
 * @param n       number of elements
 * @param chunk   length of the merged runs
 */
__global__ void cudaMergeSort(int *array, int *sorted, int n, int chunk) {

	const int first = chunk * (threadIdx.x + blockIdx.x * blockDim.x);
	if (first > n - chunk)
		return;

	const int half = first + chunk / 2;   // start of the second run
	const int last = first + chunk;       // one past the end of the chunk
	int left = first;
	int right = half;

	// One output slot per step: take from the left run when the right one is
	// exhausted, or when both have data and the left head is not larger.
	for (int out = first; out < last; ++out) {
		const bool takeLeft = (right >= last) ||
				(left < half && array[left] <= array[right]);
		sorted[out] = takeLeft ? array[left++] : array[right++];
	}
}

/**
 * Iterative binary search over the sorted range arr[0..k-1].
 * With UP set it returns the index of the first element greater than x
 * (number of elements <= x); otherwise the index of the first element not
 * smaller than x (number of elements < x). Returns k when no such element
 * exists.
 */
__device__ int binarySearch(int arr[], int x, int k, bool UP) {
	int lo = 0;
	int hi = k;

	while (lo < hi) {
		const int mid = (lo + hi) / 2;
		// UP (upper chunk B): equal elements are skipped too;
		// otherwise (lower chunk A): only strictly smaller ones
		const bool goRight = UP ? (arr[mid] <= x) : (arr[mid] < x);
		if (goRight)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo;
}

/**
 * Kernel (exercise placeholder): mergeSort with many threads. Each thread
 * deals with 2 elements: A[i] in the first chunk and the corresponding B[i]
 * in the second chunk. The body is left to implement; the binarySearch
 * device function defined above in this cell is available for it.
 *
 * @param array   input buffer
 * @param sorted  output buffer
 * @param n       number of elements
 * @param k       chunk length of the current pass
 */
__global__ void cudaMergeSortMulti(int *array, int *sorted, int n, int k) {

	// TODO

 }

/*
 * MAIN (exercise skeleton): sorts a random array with the CPU merge sort and
 * with the one-thread-per-chunk kernel; the launch loop of the
 * one-thread-per-pair kernel is left as a TODO.
 */
int main(int argc, char** argv) {

	// Create the vector with the specified size and situation
	int *orig, *array, *sorted;
	int N = 4*1024*1024;         // must be a power of 2
	int BLOCK_SIZE = 32;

	printf("Sorting array size N = %d\n",N);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// managed memory
	CHECK(cudaMallocManaged((void **)&array, N * sizeof(int)));
	CHECK(cudaMallocManaged((void **)&sorted, N * sizeof(int)));

	// random instance
	orig = (int *) malloc(N * sizeof(int));
	if (orig == NULL) {
		fprintf(stderr, "Error: cannot allocate %d host elements\n", N);
		return EXIT_FAILURE;
	}
	random_array(orig, N, 1);
	arrayCopy(array, orig, N);
	// printArray(array,N,16);

	/*****************************************************
	 *                      CPU                          *
	 *****************************************************/

	printf("** CPU processing...\n");
	double startTm = seconds();
	mergeSort(array, 0, N);
	double CPUtime = seconds() - startTm;
	printf("   CPU elapsed time: %.5f (sec)\n", CPUtime);
	check_up_sorting(array, N);

	/*****************************************************
	 *              ONE THREAD x chunk                   *
	 *****************************************************/

	printf("\n** GPU ONE THREAD x chunk processing...\n");
	arrayCopy(sorted, orig, N); // start from step 2
	bool array2sorted = false;
	CHECK(cudaEventRecord(start));
	for (int chunk = 2; chunk <= N; chunk *= 2) {
		int nThreads = N / chunk;
		dim3 block(min(nThreads, BLOCK_SIZE));
		dim3 grid((nThreads + block.x - 1) / block.x);

		// the two buffers swap roles at every pass
		if (array2sorted)
			cudaMergeSort<<<grid, block>>>(array, sorted, N, chunk);
		else
			cudaMergeSort<<<grid, block>>>(sorted, array, N, chunk);
		array2sorted = !array2sorted;
	}
	CHECK(cudaGetLastError());
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime = milliseconds / 1000.0;
	printf("   elapsed time:   %.5f (sec)\n", GPUtime);
	printf("   speedup vs CPU: %.2f\n", CPUtime / GPUtime);

	// The flag was flipped after the last pass: when it is true that pass
	// wrote into `array`, otherwise into `sorted`. Choosing the buffer from
	// the flag keeps the check right for any power-of-two N.
	check_up_sorting(array2sorted ? array : sorted, N);

	/*****************************************************
	 *              MULTI THREAD x chunk                 *
	 *****************************************************/
	printf("\n** GPU MULTI THREAD x chunk processing...\n");
	arrayCopy(array, orig, N);
	array2sorted = false;

	// grid set up
	int nThreads = N/2;
	dim3 block(min(nThreads, BLOCK_SIZE));
	dim3 grid((nThreads + block.x - 1) / block.x);
	CHECK(cudaEventRecord(start));

		// TODO

	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime1 = milliseconds / 1000.0;
	printf("   elapsed time:        %.5f (sec)\n", GPUtime1);
	printf("   speedup vs CPU:      %.2f\n", CPUtime / GPUtime1);
	printf("   speedup vs GPU mono: %.2f\n", GPUtime / GPUtime1);
	// array2sorted == false means the final data sits in `array`:
	// swap the names so that `sorted` is the buffer being checked
	if (!array2sorted) {
		int *swap = sorted;
		sorted = array;
		array = swap;
	}
	check_up_sorting(sorted, N);

//	printArray(array,N,32);
//	printArray(sorted,N,64);

	// release everything acquired above
	CHECK(cudaFree(array));
	CHECK(cudaFree(sorted));
	free(orig);
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));

	return 0;
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lab10/merge_sort.cu GPUcomputing/utils/merge_sort_CPU.cpp -o mergesort
!./mergesort

# ✅ BitonicSort

↘️ **SOL...**


 * Gli indici $k$ e $j$ in input hanno il seguente significato:
   * $k = 2,4,8,...,2^s=N$ (lunghezza delle sottosequenze bitoniche da fondere)
   * $j = k/2, k/4,...,1$ (parte dalla metà di $k$ e continua a dimezzare)
 * Gli operatori sui bit ^ (XOR) e & (AND) vengono usati per filtrare i thread:
   * $ixj = i \oplus j$ inverte in $i$ il bit di peso $j$: aggiunge o toglie a $i$ una potenza di 2, cioè $ixj = i \pm j$ con $j = 2^a$
   * $(i \,\&\, k) == 0$ è vero sse il bit di peso $k$ di $i$ vale 0, cioè $i$ appartiene a un blocco di $k$ elementi di posto pari (sort ascendente); altrimenti sort discendente
 * La condizione $ixj > i$ fa sì che ogni coppia $(i, ixj)$ sia gestita da un solo thread, quello con indice minore, per il quale $ixj = i + j$ con $j=2^a$


In [None]:
%%cuda_group_save --name "bitonic.cu" --group "lab10"

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "../../GPUcomputing/utils/common.h"

#define THREADS 1024
#define BLOCKS 16*1024

/**
 * One compare-exchange step of the bitonic sorting network.
 * Thread i is paired with ixj = i ^ j; only the thread with the smaller index
 * of each pair works. Bit k of i selects the direction: clear means the pair
 * is ordered ascending, set means descending.
 *
 * Launch layout: 1D grid with exactly one thread per element of `a`
 * (there is no bounds guard).
 *
 * @param a  device array, sorted in place over the successive steps
 * @param j  distance between partners (a power of two)
 * @param k  size of the bitonic sequences built in the current stage
 */
__global__ void bitonic_sort_step(int *a, int j, int k) {
	// sorting partners: XOR with j = 2^a adds j to i or subtracts j from it
	const unsigned int i = threadIdx.x + blockDim.x * blockIdx.x;
	const unsigned int ixj = i ^ j;

  #ifdef DEBUG
	if (i == 0)
		printf("ROUND: k = %d, j = %d\n", k, j);
  #endif

	// only the lower index of each pair does the work (partner is i + j)
	if (ixj <= i)
		return;

	const bool ascending = (i & k) == 0;

  #ifdef DEBUG
	if (ascending)
		printf("  UP  (ixj = %d\t    i = %d\t k = %d)   a[ixj] = %d - a[i] = %d\n", ixj, i, k, a[ixj],a[i]);
	else
		printf("  DOWN  (ixj = %d\t    i = %d\t k = %d)   a[ixj] = %d - a[i] = %d\n", ixj, i, k, a[ixj],a[i]);
  #endif

	const int mine = a[i];
	const int partner = a[ixj];
	const bool outOfOrder = ascending ? (mine > partner) : (mine < partner);
	if (outOfOrder) {
		a[i] = partner;
		a[ixj] = mine;
	}
}

/* Compare-exchange of a[i] and a[j]. dir is 1 for ASCENDING and 0 for
 DESCENDING: the two elements are interchanged exactly when the outcome of
 (a[i] > a[j]) equals dir. */
void compAndSwap(int a[], int i, int j, int dir) {
	const int first = a[i];
	const int second = a[j];
	if ((first > second) != dir)
		return;
	a[i] = second;
	a[j] = first;
}

/* Recursively sorts the bitonic sequence a[low .. low+cnt) in ascending
 order when dir = 1 and in descending order when dir = 0: every element of
 the first half is compare-exchanged with its peer in the second half, then
 both halves are merged in the same direction. */
void bitonicMerge(int a[], int low, int cnt, int dir) {
	if (cnt <= 1)
		return;

	const int half = cnt / 2;
	const int mid = low + half;
	for (int left = low, right = mid; left < mid; ++left, ++right)
		compAndSwap(a, left, right, dir);
	bitonicMerge(a, low, half, dir);
	bitonicMerge(a, mid, half, dir);
}

/* Sorts a[low .. low+cnt) in direction dir (1 ascending, 0 descending).
 The two halves are first sorted in opposite directions, which makes the
 whole range a bitonic sequence; bitonicMerge then orders it as requested. */
void bitonicSort(int a[], int low, int cnt, int dir) {
	if (cnt <= 1)
		return;

	const int half = cnt / 2;

	// first half ascending, second half descending
	bitonicSort(a, low, half, 1);
	bitonicSort(a, low + half, half, 0);

	// merge the whole sequence in the requested direction
	bitonicMerge(a, low, cnt, dir);
}

/*
 * MAIN: test bitonic sort on CPU and GPU. The same random data is sorted by
 * the recursive CPU version and by the GPU network, and the results are
 * compared element by element.
 */
int main(void) {
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// BLOCKS is an unparenthesized macro expression: parenthesize it here
	int N = THREADS*(BLOCKS);
	// check
	if (!(N && !(N & (N - 1)))) {
		printf("ERROR: N must be power of 2 (N = %d)\n", N);
		exit(1);
	}
	size_t nBytes = N * sizeof(int);
	int *a = (int*) malloc(nBytes);
	int *b = (int*) malloc(nBytes);
	if (a == NULL || b == NULL) {
		printf("ERROR: cannot allocate host memory\n");
		free(a);
		free(b);
		exit(1);
	}

	// fill data
	for (int i = 0; i < N; ++i) {
		a[i] =  rand() % 100;
		b[i] = a[i];
	}

	// bitonic CPU
	double cpu_time = seconds();
	bitonicSort(b, 0, N, 1);   // 1 means sort in ascending order
	printf("CPU elapsed time: %.5f (sec)\n", seconds()-cpu_time);

	// device mem copy
	int *d_a;
	CHECK(cudaMalloc((void**) &d_a, nBytes));
	CHECK(cudaMemcpy(d_a, a, nBytes, cudaMemcpyHostToDevice));

	// num of threads: one thread per element
	dim3 blocks(BLOCKS, 1);   // Number of blocks
	dim3 threads(THREADS, 1); // Number of threads

	// start computation
	CHECK(cudaEventRecord(start));
	int j, k;
	// external loop on comparators of size k
	for (k = 2; k <= N; k <<= 1) {
		// internal loop for comparator internal stages
		for (j = k >> 1; j > 0; j = j >> 1)
			bitonic_sort_step<<<blocks, threads>>>(d_a, j, k);
	}
	CHECK(cudaGetLastError());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("GPU elapsed time: %.5f (sec)\n", milliseconds / 1000);

	// recover data
	CHECK(cudaMemcpy(a, d_a, nBytes, cudaMemcpyDeviceToHost));

	// print small instances
	if (N < 100) {
		printf("GPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", a[i]);
		printf("\nCPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", b[i]);
		printf("\n");
	}

	// check GPU against CPU for every size
	bool ok = true;
	for (int i = 0; i < N; ++i) {
		if (a[i] != b[i]) {
			printf("ERROR a[%d] != b[%d]  (a[i] = %d  -  b[i] = %d)\n", i,i, a[i],b[i]);
			ok = false;
			break;
		}
	}

	CHECK(cudaFree(d_a));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	free(a);
	free(b);
	exit(ok ? 0 : 1);
}

↘️ **TODO...**

In [None]:
%%cuda_group_save --name "bitonic.cu" --group "lab10"

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#include "../../GPUcomputing/utils/common.h"

#define THREADS 1024
#define BLOCKS 16*1024

/**
 * Exercise placeholder for one compare-exchange step of the bitonic sort on
 * the device array `a`; the body is left to implement.
 *
 * @param a  device array to sort in place
 * @param j  step parameter of the inner loop of the sorting network
 * @param k  step parameter of the outer loop of the sorting network
 */
__global__ void bitonic_sort_step(int *a, int j, int k) {

	// TODO

}

/* Compare-exchange of a[i] and a[j]. dir is 1 for ASCENDING and 0 for
 DESCENDING: the two elements are interchanged exactly when the outcome of
 (a[i] > a[j]) equals dir. */
void compAndSwap(int a[], int i, int j, int dir) {
	const int first = a[i];
	const int second = a[j];
	if ((first > second) != dir)
		return;
	a[i] = second;
	a[j] = first;
}

/* Recursively sorts the bitonic sequence a[low .. low+cnt) in ascending
 order when dir = 1 and in descending order when dir = 0: every element of
 the first half is compare-exchanged with its peer in the second half, then
 both halves are merged in the same direction. */
void bitonicMerge(int a[], int low, int cnt, int dir) {
	if (cnt <= 1)
		return;

	const int half = cnt / 2;
	const int mid = low + half;
	for (int left = low, right = mid; left < mid; ++left, ++right)
		compAndSwap(a, left, right, dir);
	bitonicMerge(a, low, half, dir);
	bitonicMerge(a, mid, half, dir);
}

/* Sorts a[low .. low+cnt) in direction dir (1 ascending, 0 descending).
 The two halves are first sorted in opposite directions, which makes the
 whole range a bitonic sequence; bitonicMerge then orders it as requested. */
void bitonicSort(int a[], int low, int cnt, int dir) {
	if (cnt <= 1)
		return;

	const int half = cnt / 2;

	// first half ascending, second half descending
	bitonicSort(a, low, half, 1);
	bitonicSort(a, low + half, half, 0);

	// merge the whole sequence in the requested direction
	bitonicMerge(a, low, cnt, dir);
}

/*
 * MAIN (exercise skeleton): test bitonic sort on CPU and GPU. The CPU sort
 * and the comparison are in place; the GPU launch loops are left as a TODO.
 */
int main(void) {
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// BLOCKS is an unparenthesized macro expression: parenthesize it here
	int N = THREADS*(BLOCKS);
	// check
	if (!(N && !(N & (N - 1)))) {
		printf("ERROR: N must be power of 2 (N = %d)\n", N);
		exit(1);
	}
	size_t nBytes = N * sizeof(int);
	int *a = (int*) malloc(nBytes);
	int *b = (int*) malloc(nBytes);
	if (a == NULL || b == NULL) {
		printf("ERROR: cannot allocate host memory\n");
		free(a);
		free(b);
		exit(1);
	}

	// fill data
	for (int i = 0; i < N; ++i) {
		a[i] =  rand() % 100;
		b[i] = a[i];
	}

	// bitonic CPU
	double cpu_time = seconds();
	bitonicSort(b, 0, N, 1);   // 1 means sort in ascending order
	printf("CPU elapsed time: %.5f (sec)\n", seconds()-cpu_time);

	// device mem copy
	int *d_a;
	CHECK(cudaMalloc((void**) &d_a, nBytes));
	CHECK(cudaMemcpy(d_a, a, nBytes, cudaMemcpyHostToDevice));

	// num of threads: one thread per element
	dim3 blocks(BLOCKS, 1);   // Number of blocks
	dim3 threads(THREADS, 1); // Number of threads

	// start computation on GPU
	CHECK(cudaEventRecord(start));
	int j, k;   // loop indices for the TODO below

	  // TODO

	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("GPU elapsed time: %.5f (sec)\n", milliseconds / 1000);

	// recover data
	CHECK(cudaMemcpy(a, d_a, nBytes, cudaMemcpyDeviceToHost));

	// print small instances
	if (N < 100) {
		printf("GPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", a[i]);
		printf("\nCPU:\n");
		for (int i = 0; i < N; ++i)
			printf("%d  ", b[i]);
		printf("\n");
	}

	// check GPU against CPU for every size
	bool ok = true;
	for (int i = 0; i < N; ++i) {
		if (a[i] != b[i]) {
			printf("ERROR a[%d] != b[%d]  (a[i] = %d  -  b[i] = %d)\n", i,i, a[i],b[i]);
			ok = false;
			break;
		}
	}

	CHECK(cudaFree(d_a));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	free(a);
	free(b);
	exit(ok ? 0 : 1);
}

↩ **Run...**

In [None]:
!nvcc -arch=sm_75 src/lab10/bitonic.cu -o bitonic
!./bitonic