---
# **LAB 9 - Parallel patterns**
---

# ▶️ CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

GPU computing notebooks download (from github)

In [None]:
!git clone https://github.com/giulianogrossi/GPUcomputing.git

NVCC Plugin for Jupyter notebook

In [None]:
%cd GPUcomputing/utils/nvcc4jupyter-master/
!python3 -m build
%load_ext nvcc4jupyter
%cd /content/

# ▶️ DeviceQuery

In [None]:
# DeviceQuery for the current device (on Colab!)
!nvcc -arch=sm_75 /content/GPUcomputing/utils/deviceQuery.cu -o deviceQuery
!./deviceQuery

# ✅ BFS

In [None]:
%%cuda_group_save --name "testBFS.cu" --group "ALG"

#include <stdio.h>
#include <stdlib.h>
#include "../../GPUcomputing/utils/graph/graph.h"
#include "../../GPUcomputing/utils/common.h"

__global__ void print_d(GraphStruct*, bool);

/**
 * Kernel: clear the frontier (Fa) and visited (Xa) flags of every node.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node; the grid must
 * cover at least n threads. Threads whose index is >= n do nothing.
 *
 * @param G   graph structure (not used here; kept for signature compatibility)
 * @param Fa  frontier flags, n elements
 * @param Xa  visited flags, n elements
 * @param n   number of nodes
 */
__global__ void initBFS(GraphStruct *G, bool *Fa, bool *Xa, unsigned int n) {
	unsigned int nodeID = threadIdx.x + blockIdx.x * blockDim.x;

	// valid node IDs are 0..n-1, so thread n must already be rejected
	if (nodeID >= n)
		return;

	// set Fa and Xa vectors to false
	Fa[nodeID] = false;
	Xa[nodeID] = false;
}

/**
 * Kernel: one BFS step. The BFS frontier corresponds to all the nodes
 *         being processed at the current level.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node; the grid must
 * cover at least n threads. Threads whose index is >= n do nothing.
 *
 * Each thread whose node is in the frontier removes it from the frontier,
 * marks it visited, and for every neighbour that is not yet visited stores
 * cost = own cost + 1 and puts the neighbour into the frontier.
 *
 * @param G     graph; cumDegs[v]..cumDegs[v+1] delimits the neighbours of v in neighs
 * @param Fa    frontier flags, n elements
 * @param Xa    visited flags, n elements
 * @param Ca    per-node cost, n elements
 * @param done  set to false when at least one frontier node was processed
 * @param n     number of nodes
 *
 * NOTE(review): Fa is read and written by different threads within the same
 * launch, so a node added to the frontier may be expanded in the same launch
 * that discovered it. Removing that race needs a separate "next frontier"
 * buffer, which would change the kernel signature.
 */
__global__ void cudaBFS(GraphStruct *G, bool *Fa, bool *Xa, int *Ca, bool *done, int n) {

	int nodeID = threadIdx.x + blockIdx.x * blockDim.x;   // node ID

	// valid node IDs are 0..n-1; letting nodeID == n through would read
	// Fa[n] and cumDegs[n + 1], both out of bounds
	if (nodeID >= n)
		return;

	if (Fa[nodeID]) {
		*done = false;
		Fa[nodeID] = false;
		Xa[nodeID] = true;
		int start = G->cumDegs[nodeID];
		int deg = G->cumDegs[nodeID + 1] - start;
		for (int i = 0; i < deg; i++) {
			int neighID = G->neighs[start + i];
			if ( !Xa[neighID] ) {
				Ca[neighID] = Ca[nodeID] + 1;
				Fa[neighID] = true;
			}
		}
	}
}

/**
 * MAIN: BFS test on the GPU.
 *
 * Generates a random graph, then relaunches cudaBFS from node `source` until a
 * launch finds no frontier node. Prints the elapsed time, the number of kernel
 * launches, the average degree and the largest cost stored in Ca.
 */
int main() {

	const int BLOCK_SIZE = 512;
	unsigned int N = 20000;                // number of nodes for random graphs
	float prob = .001f;                    // density (percentage) for random graphs
	std::default_random_engine eng{0};     // fixed seed
	bool GPUEnabled = 1;
	Graph graph(N, GPUEnabled);

	// generate a random graph
	graph.randGraph(prob, eng);

	printf("** Graph done! \n");

	// get the graph struct
	GraphStruct *G = graph.getStruct();
	//print_d<<<1,1>>>(G, 1);

	// setup vars for BFS: frontier flags, visited flags, per-node cost
	bool *Fa, *Xa;
	int *Ca;
	CHECK(cudaMallocManaged((void **)&Fa, N * sizeof(bool)));
	CHECK(cudaMallocManaged((void **)&Xa, N * sizeof(bool)));
	CHECK(cudaMallocManaged((void **)&Ca, N * sizeof(int)));
	// nodes never reached keep cost 0 instead of an indeterminate value,
	// so the max-cost scan below is well defined
	CHECK(cudaMemset(Ca, 0, N * sizeof(int)));

	const int source = 0;

	bool done;
	bool *d_done;
	CHECK(cudaMalloc((void **)&d_done, sizeof(bool)));
	int count = 0;
	printf("** BFS: graph size N = %u, probability = %f\n", N, prob);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	int nThreads = N;
	dim3 block(min(nThreads, BLOCK_SIZE));
	dim3 grid((nThreads + block.x - 1) / block.x);
	CHECK(cudaEventRecord(start));
	initBFS<<<grid, block>>>(G, Fa, Xa, N);
	CHECK(cudaGetLastError());
	// the host writes the managed arrays next: wait for the kernel first
	CHECK(cudaDeviceSynchronize());

	// set the source (after initBFS, which clears every frontier flag)
	Fa[source] = true;
	Ca[source] = 0;
	do {
		count++;
		done = true;
		CHECK(cudaMemcpy(d_done, &done, sizeof(bool), cudaMemcpyHostToDevice));
		cudaBFS<<< grid, block >>>(G, Fa, Xa, Ca, d_done, N);
		CHECK(cudaGetLastError());
		// blocking copy: also waits for the kernel launched above
		CHECK(cudaMemcpy(&done, d_done, sizeof(bool), cudaMemcpyDeviceToHost));
	} while (!done);
	CHECK(cudaDeviceSynchronize());
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds = 0.0f;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	float GPUtime = milliseconds / 1000.0f;
	printf("   elapsed time:   %.5f (sec)\n", GPUtime);
	printf("   Number of times the kernel is called : %d \n", count);
	// cumDegs[N] is the end offset of the last node, i.e. the total number of
	// adjacency entries (cumDegs[N-1] would leave out the last node's degree)
	printf("   Average deg: = %f\n", (float)G->cumDegs[N]/N);

	int max_Ca = 0;
	for (unsigned int i = 0; i < N; i++) {
		if (Ca[i] > max_Ca)
			max_Ca = Ca[i];
	}
	printf("   Cost: max Ca = %d\n", max_Ca);

	// release everything acquired above
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	CHECK(cudaFree(d_done));
	CHECK(cudaFree(Fa));
	CHECK(cudaFree(Xa));
	CHECK(cudaFree(Ca));

	return 0;
}

In [None]:
# Compile and run
!nvcc -arch=sm_75  src/ALG/testBFS.cu GPUcomputing/utils/graph/graph.cpp GPUcomputing/utils/graph/graph_d.cu -o testBFS
!./testBFS

# ✅ Luby coloring


In [None]:
%%cuda_group_save --name "coloring.h" --group "ALG"

#pragma once

#include <curand_kernel.h>
#include "../../GPUcomputing/utils/graph/graph.h"
#include "../../GPUcomputing/utils/common.h"

/**
 *  Graph coloring struct (colors are: 1,2,3,..,k).
 *
 *  A value of 0 in `coloring` means "not colored yet": findIS skips every node
 *  whose entry is non-zero and stores the current value of numOfColors into
 *  the entry of each node it colors.
 */

struct Coloring {
	bool		uncoloredNodes;   // set by findIS when a node is left uncolored; LubyGreedy loops while it is true
	uint		numOfColors;      // incremented by LubyGreedy before each findIS launch; the color assigned in that launch
	uint	*	coloring;   // each element denotes a color (indexed by node id)
};

Coloring* LubyGreedy(GraphStruct*);
void printColoring (Coloring*, GraphStruct*, bool);
__global__ void LubyJPcolorer (Coloring*, GraphStruct*, uint*) ;
__global__ void init(uint seed, curandState_t*, uint*, uint);
__global__ void findIS (Coloring*, GraphStruct*, uint*);
__global__ void print_d(GraphStruct*, bool);

In [None]:
%%cuda_group_save --name "luby.cu" --group "ALG"

#include <iostream>
#include "coloring.h"
#include "../../GPUcomputing/utils/graph/graph_d.h"
#include "../../GPUcomputing/utils/common.h"

using namespace std;

#define THREADxBLOCK 128

/**
 * Color the graph on the GPU by repeatedly extracting an independent set.
 *
 * Every node gets a random weight (kernel init); then findIS is launched once
 * per color until no launch reports an uncolored node.
 *
 * @param str  graph structure; read both on the host (nodeSize) and in the kernels
 * @return     a Coloring allocated with cudaMallocManaged (the struct and its
 *             `coloring` array); the caller releases both with cudaFree.
 *             For an empty graph `coloring` is NULL and numOfColors is 0.
 */
Coloring* LubyGreedy(GraphStruct *str) {
	// set coloring struct
	Coloring* col;
	CHECK(cudaMallocManaged(&col, sizeof(Coloring)));
	uint n = str->nodeSize;
	col->uncoloredNodes = true;
	col->numOfColors = 0;
	col->coloring = NULL;

	// empty graph: nothing to color, and a launch with 0 blocks is invalid
	if (n == 0) {
		col->uncoloredNodes = false;
		return col;
	}

	// managed array for the colors, all nodes start uncolored (0)
	CHECK(cudaMallocManaged( &(col->coloring), n * sizeof(uint)));
	// memset counts bytes, so the size must be n * sizeof(uint), not n
	memset(col->coloring, 0, n * sizeof(uint));

	// allocate space on the GPU for the random states and the node weights
	curandState_t* states;
	uint* weigths;
	CHECK(cudaMalloc((void**) &states, n * sizeof(curandState_t)));
	CHECK(cudaMalloc((void**) &weigths, n * sizeof(uint)));
	dim3 threads ( THREADxBLOCK);
	dim3 blocks ((n + threads.x - 1) / threads.x, 1, 1 );
	uint seed = 0;
	init <<< blocks, threads >>> (seed, states, weigths, n);
	CHECK(cudaGetLastError());
	// the loop below writes managed memory from the host: make sure no
	// kernel is still running before doing so
	CHECK(cudaDeviceSynchronize());

	// loop on ISs covering the graph
	while (col->uncoloredNodes) {
		col->uncoloredNodes = false;
		col->numOfColors++;
		findIS <<< blocks, threads >>> (col, str, weigths);
		CHECK(cudaGetLastError());
		// the loop condition reads a flag written by the kernel
		CHECK(cudaDeviceSynchronize());
	}

	CHECK(cudaFree(states));
	CHECK(cudaFree(weigths));
	return col;
}

/**
 * Kernel: find an independent set among the uncolored nodes and give it the
 * color col->numOfColors.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per node; threads with
 * idx >= str->nodeSize do nothing.
 *
 * An uncolored node is colored when no competing neighbour beats it, where
 * neighbour v beats node u if weights[u] < weights[v], or the weights are
 * equal and u < v. Nodes that lose set col->uncoloredNodes so the host
 * launches another round.
 *
 * @param col      coloring being built (coloring[v] == 0 means uncolored)
 * @param str      graph; cumDegs[v]..cumDegs[v+1] delimits the neighbours of v in neighs
 * @param weights  one weight per node
 */
__global__ void findIS (Coloring* col, GraphStruct *str, uint* weights) {
	uint idx = threadIdx.x + blockDim.x * blockIdx.x;

	if (idx >= str->nodeSize)
		return;

	// colored in a previous round
	if (col->coloring[idx])
		return;

	const uint round = col->numOfColors;
	const uint myWeight = weights[idx];
	uint offset = str->cumDegs[idx];
	uint deg = str->cumDegs[idx + 1] - offset;

	bool candidate = true;
	for (uint j = 0; j < deg && candidate; j++) {
		uint neighID = str->neighs[offset + j];
		uint neighColor = col->coloring[neighID];
		// A neighbour may be colored by another thread of this same launch.
		// It was uncolored when the round started, so it still competes:
		// ignoring it would let two adjacent nodes take the same color.
		// Colors from earlier rounds are always < round.
		bool competing = (neighColor == 0) || (neighColor == round);
		if (competing &&
				((myWeight < weights[neighID]) ||
				((myWeight == weights[neighID]) && idx < neighID))) {
			candidate = false;
		}
	}
	if (candidate) {
		col->coloring[idx] = round;
	}
	else
		col->uncoloredNodes = true;
}


/**
 *  Kernel: initialise one cuRAND state per node and draw one weight per node.
 *
 *  Launch layout: 1D grid of 1D blocks, one thread per element; threads with
 *  idx >= n do nothing.
 *
 *  @param seed     seed passed to curand_init (the thread index is the sequence number)
 *  @param states   n cuRAND states, written here
 *  @param numbers  n output weights, each equal to (random % n) * n
 *  @param n        number of elements
 *
 *  NOTE(review): (random % n) * n yields only n distinct values and can wrap
 *  for n > 65535; `% (n * n)` may have been intended. The original expression
 *  is kept so the generated weights do not change.
 */
__global__ void init (uint seed, curandState_t* states, uint* numbers, uint n) {
	uint idx = blockIdx.x * blockDim.x + threadIdx.x;
	// valid indices are 0..n-1, so thread n must already be rejected
	if (idx >= n)
			return;
	curand_init(seed, idx, 0, &states[idx]);
	numbers[idx] = (curand(&states[idx]) % n) * n;
}

/**
 * Print a summary of the graph and of its coloring.
 * Always prints the node/edge counts and the number of colors; with
 * verbose set it also prints, for every color 1..numOfColors, the list of
 * nodes that carry it.
 * @param col coloring to print
 * @param str graph the coloring refers to
 * @param verbose print the node list of every color
 */
void printColoring (Coloring* col, GraphStruct* str, bool verbose) {
	node numNodes = str->nodeSize;
	cout << "** Graph (num node: " << numNodes << ", num edges: " << str->edgeSize << ")" << endl;
	cout << "** Coloring (num colors: " << col->numOfColors << ")" << endl;

	// summary only
	if (!verbose)
		return;

	int color = 1;
	while (color <= col->numOfColors) {
		cout << "   color(" << color << ")" << "-> ";
		// list every node that carries this color
		for (int v = 0; v < numNodes; v++) {
			if (col->coloring[v] != color)
				continue;
			cout << v << " ";
		}
		cout << "\n";
		color++;
	}
	cout << "\n";
}


### TEST on Luby


Graph print layout:
```
** Graph (num node: 10, num edges: 46)
      (min deg: 3, max deg: 7, mean deg: 4.6, connected: 1)
      node(0)[5]-> 1 2 3 7 9
      node(1)[3]-> 0 2 9
      node(2)[5]-> 0 1 3 5 8
      node(3)[7]-> 0 2 4 5 6 7 9
      node(4)[5]-> 3 6 7 8 9
      node(5)[5]-> 2 3 6 8 9
      node(6)[4]-> 3 4 5 8
      node(7)[3]-> 0 3 4
      node(8)[4]-> 2 4 5 6
      node(9)[5]-> 0 1 3 4 5
```


Coloring print layout:
```
** Graph (num node: 10, num edges: 36)
** Coloring (num colors: 6)
    color(1)-> 1 3 8
    color(2)-> 0 6
    color(3)-> 4 7
    color(4)-> 9
    color(5)-> 5
    color(6)-> 2
```




In [None]:
%%cuda_group_save --name "test_Luby.cu" --group "ALG"

#include "coloring.h"

/**
 * Test driver for the Luby coloring: builds a random graph, prints it when
 * it is small, and colors it on the GPU with LubyGreedy.
 */
int main(void) {
	unsigned int n = 10;		 // number of nodes for random graphs
	float prob = .5f;				    // density (percentage) for random graphs
	std::default_random_engine eng{0};  // fixed seed

	// new graph with n nodes
	Graph graph(n,1);

	// generate a random graph
	graph.randGraph(prob,eng);

	// get the graph struct
	GraphStruct *str = graph.getStruct();

	// print small graph
	if (n <= 128) {
		graph.print(true);  // CPU print
		print_d<<< 1, 1 >>>(str, true);  // GPU print
		CHECK(cudaGetLastError());
		// LubyGreedy touches managed memory from the host: the print kernel
		// must have finished before that happens
		CHECK(cudaDeviceSynchronize());
	}

	// GPU Luby-JP greedy coloring
	Coloring* col = LubyGreedy(str);
	CHECK(cudaDeviceSynchronize());
	//printColoring(col, str, 1);

	// the coloring and its array are allocated with cudaMallocManaged
	CHECK(cudaFree(col->coloring));
	CHECK(cudaFree(col));

	return EXIT_SUCCESS;
}


In [None]:
# Compile and run

!nvcc -arch=sm_75 src/ALG/test_Luby.cu src/ALG/luby.cu GPUcomputing/utils/graph/graph.cpp GPUcomputing/utils/graph/graph_d.cu -o testLuby


In [None]:
!./testLuby

# ✅ PageRank (Google)

In [None]:
%%cuda_group_save --name "pagerank.cu" --group "ALG"

#include <stdio.h>
#include <bits/stdc++.h>
#include <cuda.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>

using namespace std;


/**
 * Fill the dense n x n matrix used by PageRank from an edge list.
 *
 * Expected input (after the node count, which the caller has already read):
 * the number of edges m, an indexing flag (0 = node ids start at 0, anything
 * else = node ids start at 1), then m pairs "source destination".
 *
 * Every cell starts at (1 - d) / n; each edge adds d to the cell at row
 * `destination`, column `source`.
 *
 * Malformed input is tolerated: a missing header means "no edges", reading
 * stops at the first pair that cannot be parsed, and edges whose endpoints
 * fall outside [0, n) are skipped instead of being written out of bounds.
 *
 * @param graph         output, n*n floats, row-major
 * @param n             number of nodes
 * @param d             damping factor
 * @param inputFilePtr  open stream positioned just after the node count
 */
void get_adj_matrix(float* graph, int n, float d, FILE *inputFilePtr ){

   if (graph == NULL || inputFilePtr == NULL || n <= 0)
      return;

   int m = 0, indexing = 0;
   if (fscanf(inputFilePtr, "%d", &m) != 1 ||
       fscanf(inputFilePtr, "%d", &indexing) != 1)
      m = 0;   // no usable header: keep only the teleport term

   // size_t arithmetic: n * n overflows int for n > 46340
   const float base = (1 - d)/float(n);
   const size_t cells = (size_t)n * (size_t)n;
   for (size_t k = 0; k < cells; ++k)
      graph[k] = base;

   // node ids in the file are shifted by one when indexing is 1-based
   const int shift = (indexing == 0) ? 0 : 1;

   while (m-- > 0) {
      int source, destin;
      if (fscanf(inputFilePtr, "%d", &source) != 1 ||
          fscanf(inputFilePtr, "%d", &destin) != 1)
         break;   // truncated file

      source -= shift;
      destin -= shift;
      if (source < 0 || source >= n || destin < 0 || destin >= n)
         continue;   // edge refers to a node that does not exist

      graph[(size_t)destin * n + source] += d;
   }
}

// Kernel: normalise each column of the n x n row-major matrix so that it sums
// to 1. One thread per column (1D launch covering at least n threads).
// A column whose sum is exactly zero is replaced by the uniform value 1/n.
__global__ void manage_adj_matrix(float* gpu_graph, int n){
   const int col = blockIdx.x * blockDim.x + threadIdx.x;

   if (col < n) {
      // column total
      float total = 0.0f;
      for (int row = 0; row < n; ++row)
         total += gpu_graph[row * n + col];

      if (total != 0.0f) {
         for (int row = 0; row < n; ++row)
            gpu_graph[row * n + col] /= total;
      } else {
         const float uniform = 1.0f / (float)n;
         for (int row = 0; row < n; ++row)
            gpu_graph[row * n + col] = uniform;
      }
   }
}

// Kernel: give every node the same starting rank 1/n.
// One thread per node (1D launch covering at least n threads).
__global__ void initialize_rank(float* gpu_r, int n){
   const int node_idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (node_idx >= n)
      return;

   gpu_r[node_idx] = 1.0f / (float)n;
}

// Kernel: copy the current rank vector into gpu_r_last.
// One thread per node (1D launch covering at least n threads).
__global__ void store_rank(float* gpu_r,float* gpu_r_last, int n){
   const int node_idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (node_idx >= n)
      return;

   gpu_r_last[node_idx] = gpu_r[node_idx];
}

// Kernel: gpu_r = gpu_graph * gpu_r_last for an n x n row-major matrix.
// One thread per output row (1D launch covering at least n threads).
__global__ void matmul(float* gpu_graph, float* gpu_r, float* gpu_r_last, int n){
    const int row = blockIdx.x * blockDim.x + threadIdx.x;

    if (row >= n)
        return;

    // dot product of matrix row `row` with the previous rank vector
    float acc = 0.0f;
    for (int col = 0; col < n; ++col)
        acc += gpu_r_last[col] * gpu_graph[row * n + col];

    gpu_r[row] = acc;
}

// Kernel: overwrite gpu_r_last with |gpu_r_last - gpu_r|, element by element.
// One thread per node (1D launch covering at least n threads).
__global__ void rank_diff(float* gpu_r,float* gpu_r_last, int n){
   const int node_idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (node_idx >= n)
      return;

   const float delta = gpu_r_last[node_idx] - gpu_r[node_idx];
   gpu_r_last[node_idx] = fabsf(delta);
}

// Kernel: build (rank, node number) pairs; node numbers are 1-based (index + 1).
// One thread per node (1D launch covering at least n threads).
__global__ void init_pair_array(pair<float, int>* gpu_r_nodes, float * gpu_r, int n){
   const int node_idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (node_idx >= n)
      return;

   pair<float, int>& entry = gpu_r_nodes[node_idx];
   entry.first = gpu_r[node_idx];
   entry.second = node_idx + 1;
}


/**
 * Power iteration for PageRank on the GPU.
 *
 * Starts from the uniform vector and repeats r = M * r_last until the sum of
 * |r - r_last| drops below eps or max_iter iterations have run. The final
 * rank vector is copied into the host array r.
 *
 * @param gpu_graph  device pointer to the n x n row-major matrix
 * @param r          host output array, n floats
 * @param n          number of nodes
 * @param nblocks    grid size used for every launch
 * @param BLOCKSIZE  block size used for every launch
 * @param max_iter   maximum number of iterations
 * @param eps        convergence threshold on the L1 distance between iterates
 */
void power_method(float *gpu_graph, float *r, int n, int nblocks, int BLOCKSIZE, int max_iter = 1000, float eps = 0.000001 ){
   // abort with a message on any CUDA failure
   auto check = [](cudaError_t err, const char* what) {
      if (err != cudaSuccess) {
         fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
         exit(EXIT_FAILURE);
      }
   };

   if (n <= 0)
      return;

   // host copy of the per-node differences, released automatically
   vector<float> r_last(n);
   float* gpu_r = NULL;
   float* gpu_r_last = NULL;
   check(cudaMalloc(&gpu_r, sizeof(float)*n), "cudaMalloc gpu_r");
   check(cudaMalloc(&gpu_r_last, sizeof(float)*n), "cudaMalloc gpu_r_last");

   initialize_rank<<< nblocks, BLOCKSIZE >>>(gpu_r, n);
   check(cudaGetLastError(), "initialize_rank launch");

   while(max_iter-- > 0){
      store_rank<<< nblocks, BLOCKSIZE >>>(gpu_r, gpu_r_last, n);
      matmul<<< nblocks, BLOCKSIZE >>>(gpu_graph, gpu_r, gpu_r_last, n);
      rank_diff<<< nblocks, BLOCKSIZE >>>(gpu_r, gpu_r_last, n);
      check(cudaGetLastError(), "power iteration launch");
      // blocking copy: also waits for the three kernels above
      check(cudaMemcpy(r_last.data(), gpu_r_last, n* sizeof(float), cudaMemcpyDeviceToHost),
            "cudaMemcpy differences");
      float result = thrust::reduce(r_last.data(), r_last.data() + n);

      if(result < eps)
         break;
   }

   // hand the result back to the caller: r is the output of this function
   check(cudaMemcpy(r, gpu_r, n * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy ranks");

   check(cudaFree(gpu_r), "cudaFree gpu_r");
   check(cudaFree(gpu_r_last), "cudaFree gpu_r_last");
}

/**
 * Print the `count` nodes with the highest rank, best first.
 *
 * The (rank, node number) pairs are built on the GPU by init_pair_array,
 * copied back, and sorted in ascending order on the host; the output walks
 * the sorted array from its end. Node numbers are 1-based.
 *
 * @param r          host array with n ranks
 * @param n          number of nodes
 * @param nblocks    grid size for the pair-building kernel
 * @param BLOCKSIZE  block size for the pair-building kernel
 * @param count      how many nodes to print (clamped to n)
 */
void top_nodes(float* r, int n, int nblocks, int BLOCKSIZE, int count = 10) {
   // abort with a message on any CUDA failure
   auto check = [](cudaError_t err, const char* what) {
      if (err != cudaSuccess) {
         fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
         exit(EXIT_FAILURE);
      }
   };

   if (n <= 0)
      return;

   vector< pair<float, int> > r_nodes(n);
   pair<float, int> *gpu_r_nodes = NULL;
   float* gpu_r = NULL;

   check(cudaMalloc(&gpu_r_nodes, n * sizeof (pair<float, int>)), "cudaMalloc gpu_r_nodes");
   check(cudaMalloc(&gpu_r, sizeof(float)*n), "cudaMalloc gpu_r");
   check(cudaMemcpy(gpu_r, r, sizeof(float)*n, cudaMemcpyHostToDevice), "cudaMemcpy ranks");

   init_pair_array<<<nblocks, BLOCKSIZE>>>(gpu_r_nodes, gpu_r, n);
   check(cudaGetLastError(), "init_pair_array launch");

   check(cudaMemcpy(r_nodes.data(), gpu_r_nodes, n * sizeof (pair<float, int>), cudaMemcpyDeviceToHost),
         "cudaMemcpy pairs");

   check(cudaFree(gpu_r), "cudaFree gpu_r");
   check(cudaFree(gpu_r_nodes), "cudaFree gpu_r_nodes");

   // ascending by rank (ties by node number): the best nodes end up last.
   // Without this step the loop below would just print the last node ids.
   std::sort(r_nodes.begin(), r_nodes.end());

   // never index before the start of the array
   if (count > n)
      count = n;

   for (int rank = 1; rank <= count; ++rank)
      printf("Rank %d Node is %d\n", rank, r_nodes[n - rank].second);
}

/**
 * PageRank driver.
 *
 * Usage: <program> <graph file> <block size>
 *
 * The graph file starts with the node count, followed by the data read by
 * get_adj_matrix. The program builds the matrix, normalises it on the GPU,
 * runs the power method and prints the top-ranked nodes and the CPU time
 * spent in the GPU phase.
 */
int main(int argc, char** argv) {

  // abort with a message on any CUDA failure
  auto check = [](cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
      fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
      exit(EXIT_FAILURE);
    }
  };

  if (argc < 3) {
    fprintf(stderr, "Usage: %s <graph file> <block size>\n", argv[0]);
    return EXIT_FAILURE;
  }

  int BLOCKSIZE = atoi(argv[2]);
  if (BLOCKSIZE <= 0) {
    // also guards the division used to size the grid
    fprintf(stderr, "Invalid block size: %s\n", argv[2]);
    return EXIT_FAILURE;
  }

  FILE* inputFilePtr = fopen(argv[1], "r");
  if (inputFilePtr == NULL) {
    perror(argv[1]);
    return EXIT_FAILURE;
  }

  int n = 0;
  if (fscanf(inputFilePtr, "%d", &n) != 1 || n <= 0) {
    fprintf(stderr, "%s: cannot read a positive node count\n", argv[1]);
    fclose(inputFilePtr);
    return EXIT_FAILURE;
  }

  int nblocks = (n + BLOCKSIZE - 1) / BLOCKSIZE;
  // size_t arithmetic: n * n overflows int for n > 46340
  size_t matrixBytes = sizeof(float) * (size_t)n * (size_t)n;
  float* graph = (float*)malloc(matrixBytes);
  float* r = (float*) calloc(n, sizeof(float));
  if (graph == NULL || r == NULL) {
    fprintf(stderr, "Out of host memory for %d nodes\n", n);
    free(graph);
    free(r);
    fclose(inputFilePtr);
    return EXIT_FAILURE;
  }

  float d = 0.85f;

  get_adj_matrix(graph, n, d, inputFilePtr);
  fclose(inputFilePtr);

  float* gpu_graph = NULL;
  check(cudaMalloc(&gpu_graph, matrixBytes), "cudaMalloc gpu_graph");
  check(cudaMemcpy(gpu_graph, graph, matrixBytes, cudaMemcpyHostToDevice), "cudaMemcpy graph");

  clock_t start = clock();
  manage_adj_matrix<<< nblocks, BLOCKSIZE >>>(gpu_graph, n);
  check(cudaGetLastError(), "manage_adj_matrix launch");
  power_method(gpu_graph, r, n, nblocks, BLOCKSIZE);
  top_nodes(r, n, nblocks, BLOCKSIZE);
  clock_t end = clock();

  // clock() counts ticks: divide by CLOCKS_PER_SEC to report seconds
  printf("Time taken :%f sec for parallel implementation with %d nodes.\n",
         float(end - start) / CLOCKS_PER_SEC, n);

  check(cudaFree(gpu_graph), "cudaFree gpu_graph");
  free(graph);
  free(r);
  return 0;
}

In [None]:
!nvcc -arch=sm_75 src/ALG/pagerank.cu -o pr
!./pr GPUcomputing/lab9/1000.txt 128