# Part 1: Implement BFS - Sequentially


In [29]:
%%writefile sequential.cu


#include <time.h>
#include <stdio.h>
#include <stdlib.h>

int gate_solver(int nodeInput, int nodeOutput, int nodeGate) {
    
    if(nodeGate == 0){
         if (nodeInput == 1 && nodeOutput == 1){return 1;}
         else {return 0;} 
    }
    if(nodeGate == 1){
        if (nodeInput == 0 && nodeOutput == 0) {return 0;}
        else {return 1;}    
    }

    if(nodeGate == 2){
        if (nodeInput == 1 && nodeOutput == 1) {return 0;}
        else {return 1;}    
    }
    
    if(nodeGate == 3){
        if (nodeInput == 0 && nodeOutput == 0) {return 1;}
        else {return 0;}    
    }

    if(nodeGate == 4){
        if (nodeInput == nodeOutput) {return 0;}
        else {return 1;}    
    }

    if(nodeGate == 5){
        if (nodeInput == nodeOutput) {return 1;}
        else {return 0;}    
    }
     
}

// Reads the third input file
int read_third_file(char* path_inputfile, int** first_input, int** second_input, int** third_input, int** fourth_input) {
    int size_file;
    int read_value_1; int read_value_2; int read_value_3; int read_value_4;
    FILE* read_this;
    
    //File Open
    if ((read_this = fopen(path_inputfile, "r")) == NULL){
        printf("Error: Cannot Open File!");
        exit(1);
    } 

    //Allocating memory for input variables
    fscanf(read_this, "%d", &size_file);
    *first_input = (int*) malloc(size_file * sizeof(int)); *second_input = (int*) malloc(size_file * sizeof(int));
    *third_input = (int*) malloc(size_file * sizeof(int)); *fourth_input = (int*) malloc(size_file * sizeof(int));

    //Iterating through all four read values
    for (int i = 0; i < size_file; i++) {
        fscanf(read_this, "%d, %d, %d, %d", &read_value_1, &read_value_2, &read_value_3, &read_value_4);
        //Assigning read values to corresponding input variables
        (*first_input)[i] = read_value_1; (*second_input)[i] = read_value_2;
        (*third_input)[i] = read_value_3; (*fourth_input)[i] = read_value_4;
    }

    fclose(read_this);
    return size_file;

}

// Reads in the passed input file
int read_input(int** input_vals, char* path_inputfile) {
    int size_file;
    int read_value;

    FILE* read_this;
    
    //File Open
    if ((read_this = fopen(path_inputfile, "r")) == NULL){
        printf("Error: Cannot Open File!");
        exit(1);
    } 

    fscanf(read_this, "%d", &size_file);
    *input_vals = (int*) malloc(size_file * sizeof(int));

    //Reading input file values
    for (int i = 0; i < size_file; i++) {
        fscanf(read_this, "%d", &read_value);
        (*input_vals)[i] = read_value;
    }

    fclose(read_this);
    return size_file;
}

int main(int argc, char* argv[]) {  
    // User Input
    char* input_1 = argv[1]; char* input_2 = argv[2]; char* input_3 = argv[3]; char* input_4 = argv[4];
    char* output_node = argv[5]; char* output_nln = argv[6];

    // Declaring Variables
    
    //Node variables
    int countNodes = 0; int* nodePtrs; int numNodePtrs = 0;
    int* nodeInput; int* nodeGate; int* nodeOutput;
    
    //Node Neighbours variables
    int* nodeNeighbors; int countNodeNeighbors = 0; int* nodeVisited; 
    
    //Node level variables
    int* currLevelNodes; int numCurrLevelNodes = 0; //Current level
    int* nextLevelNodes; int numNextLevelNodes = 0; //Next level

    //Argument Check
    if (argc != 7) {return fprintf(stderr, "Not Enough or Too Many Arguments!\n");}

    //Assigning values for Node variables
    countNodes = read_third_file(input_3, &nodeVisited, &nodeGate, &nodeInput, &nodeOutput);
    numNodePtrs = read_input(&nodePtrs, input_1);
    countNodeNeighbors = read_input(&nodeNeighbors, input_2);
    
    numCurrLevelNodes = read_input(&currLevelNodes, input_4);
    nextLevelNodes = (int*) malloc(countNodes * sizeof(int));

    //BFS Loop
    clock_t begin_timer = clock();
    // Loop over all nodes in the current level 
    for (int i = 0; i < numCurrLevelNodes; i++) {
        int node = currLevelNodes[i];
        
        // Loop over all neighbors of the node
        for (int j = nodePtrs[node]; j < nodePtrs[node+1]; j++) {
            int neighbor = nodeNeighbors[j];
            
            // If the neighbor hasn't been visited yet 
            if (!nodeVisited[neighbor]) {
                
                // Mark it and add it to the queue 
                nodeVisited[neighbor] = 1;
                nodeOutput[neighbor] = gate_solver(nodeInput[neighbor], nodeOutput[node], nodeGate[neighbor]);
                nextLevelNodes[numNextLevelNodes] = neighbor;
                ++numNextLevelNodes;
            }
        }
    }
    clock_t stop_timer = clock();
    
    // Opening output files
    FILE* outfile_node = fopen(output_node, "w"); FILE* outfile_nln = fopen(output_nln, "w");

    // File Check
    if(!outfile_node || !outfile_nln){
        return fprintf(stderr, "Invalid Output Files");
    } 

    //Writing values to output_nodeOutput
    fprintf(outfile_node, "%d\n", countNodes);
    for (int i = 0; i < countNodes; i++) { fprintf(outfile_node, "%d\n", nodeOutput[i]); }
    fclose(outfile_node);

    //Writing values to output_nextLevelNodes
    fprintf(outfile_nln, "%d\n", numNextLevelNodes);
    for (int i = 0; i < numNextLevelNodes; i++) { fprintf(outfile_nln, "%d\n", nextLevelNodes[i]); }
    fclose(outfile_nln);
    
    //Printing Runtime
    printf("Sequential time: %f ms\n", (double) (stop_timer - begin_timer) / CLOCKS_PER_SEC * 1000);

}



Overwriting sequential.cu


### Compile Source File - sequential.c

In [30]:
!gcc sequential.c -o sequential
!./sequential input1.raw input2.raw input3.raw input4.raw output_nodeOutput.raw output_nextLevelNodes.raw

Runtime time: 2.238000 ms


### Compile Compare Programs

In [31]:
!gcc compareNodeOutput.c -o compareNodeOutput
!gcc compareNextLevelNodes.c -o compareNextLevelNodes

### Compare Output Files

In [32]:
!./compareNodeOutput output_nodeOutput.raw sol_nodeOutput.raw
!./compareNextLevelNodes output_nextLevelNodes.raw sol_nextLevelNodes.raw

Total Errors : 0	No errors!


# Implement BFS - Parallelize

### Add CUDA version of compare programs

In [33]:
%%writefile cuda_compareNodeOutput.cu
#include <stdio.h>
#include <stdlib.h>


void compareFiles(char *file_name1, char *file_name2) 
{ 
//get from https://www.tutorialspoint.com/c-program-to-compare-two-files-and-report-mismatches
FILE* fp1 = fopen(file_name1, "r");
FILE* fp2 = fopen(file_name2, "r");
    // fetching character of two file 
    // in two variable ch1 and ch2 
    char ch1 = getc(fp1); 
    char ch2 = getc(fp2); 
  
    // error keeps track of number of errors 
    // pos keeps track of position of errors 
    // line keeps track of error line 
    int error = 0, pos = 0, line = 1; 
  
    // iterate loop till end of file 
    while (ch1 != EOF && ch2 != EOF) 
    { 
        pos++; 
  
        // if both variable encounters new 
        // line then line variable is incremented 
        // and pos variable is set to 0 
        if (ch1 == '\n' && ch2 == '\n') 
        { 
            line++; 
            pos = 0; 
        } 
  
        // if fetched data is not equal then 
        // error is incremented 
        if (ch1 != ch2) 
        { 
            error++; 
            printf("Line Number : %d \tError"
               " Position : %d \n", line, pos); 
        } 
  
        // fetching character until end of file 
        ch1 = getc(fp1); 
        ch2 = getc(fp2); 
    } 
  
    printf("Total Errors : %d\t", error); 
} 

int main(int argc, char *argv[]){

    if( argc < 3) {
      printf("Require two files\n");
      exit(1);
      
   }
compareFiles(argv[1], argv[2]);
}


Overwriting cuda_compareNodeOutput.cu


In [34]:
%%writefile cuda_compareNextLevelNodes.cu
#include <stdio.h>
#include <stdlib.h>
void sort(int *pointer, int size){
  //get from https://stackoverflow.com/questions/13012594/sorting-with-pointers-instead-of-indexes
    int *i, *j, temp;
    for(i = pointer; i < pointer + size; i++){
        for(j = i + 1; j < pointer + size; j++){
            if(*j < *i){
                temp = *j;
                *j = *i;
                *i = temp;
            }
        }
    }
}

void compareNextLevelNodeFiles(char *file_name1, char *file_name2) 
{ 

  
    FILE* fp_1 = fopen(file_name1, "r");
    if (fp_1 == NULL){
     fprintf(stderr, "Couldn't open file for reading\n");
     exit(1);
    } 

    FILE* fp_2 = fopen(file_name2, "r");
    if (fp_2 == NULL){
     fprintf(stderr, "Couldn't open file for reading\n");
     exit(1);
    } 
    
    int counter = 0;
    int len_1;
    int len_2;
    int length_file_1 = fscanf(fp_1, "%d", &len_1);
    int length_file_2 = fscanf(fp_2, "%d", &len_2);

    if(length_file_1 != length_file_2){
      fprintf(stderr, "Wrong file length\n");
      exit(1);
    }
    int *input1 = (int *)malloc(len_1 * sizeof(int));
    int *input2 = (int *)malloc(len_2 * sizeof(int));




    int temp1;
    int temp2;

    while ((fscanf(fp_1, "%d", &temp1) == 1) && (fscanf(fp_2, "%d", &temp2) == 1)) {
        (input1)[counter] = temp1;
        (input2)[counter] = temp2;
        counter++;
    }

    sort(input1, len_1);
    sort(input2, len_2);

    for(int i=0; i< len_1; i++){
      if(input1[i] != input2[i]){
        fprintf(stderr, "Something goes wrong\n");
        exit(1);
      }
    }

    fprintf(stderr, "No errors!\n");
        exit(1);

} 

int main(int argc, char *argv[]){

    if( argc < 3) {
      printf("Require two files\n");
      exit(1);
      
   }
compareNextLevelNodeFiles(argv[1], argv[2]);
}


Overwriting cuda_compareNextLevelNodes.cu


### Compile Compare Programs

In [35]:
!nvcc cuda_compareNodeOutput.cu -o compareNodeOutput
!nvcc cuda_compareNextLevelNodes.cu -o compareNextLevelNodes

## Part 2: Global Queuing

In [39]:
%%writefile global_queuing.cu

#ifndef CUDACC
#define CUDACC
#endif

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
#define AND 0
#define OR 1
#define NAND 2
#define NOR 3
#define XOR 4
#define XNOR 5

// Reads the third input file
int read_third_file(char* path_inputfile, int** first_input, int** second_input, int** third_input, int** fourth_input) {
    int size_file;
    int read_value_1; int read_value_2; int read_value_3; int read_value_4;
    FILE* read_this;
    
    //File Open
    if ((read_this = fopen(path_inputfile, "r")) == NULL){
        printf("Error: Cannot Open File!");
        exit(1);
    } 

    //Allocating memory for input variables
    fscanf(read_this, "%d", &size_file);
    *first_input = (int*) malloc(size_file * sizeof(int)); *second_input = (int*) malloc(size_file * sizeof(int));
    *third_input = (int*) malloc(size_file * sizeof(int)); *fourth_input = (int*) malloc(size_file * sizeof(int));

    //Iterating through all four read values
    for (int i = 0; i < size_file; i++) {
        fscanf(read_this, "%d, %d, %d, %d", &read_value_1, &read_value_2, &read_value_3, &read_value_4);
        //Assigning read values to corresponding input variables
        (*first_input)[i] = read_value_1; (*second_input)[i] = read_value_2;
        (*third_input)[i] = read_value_3; (*fourth_input)[i] = read_value_4;
    }

    fclose(read_this);
    return size_file;

}

// Reads in the passed input file
int read_input(int** input_vals, char* path_inputfile) {
    int size_file;
    int read_value;

    FILE* read_this;
    
    //File Open
    if ((read_this = fopen(path_inputfile, "r")) == NULL){
        printf("Error: Cannot Open File!");
        exit(1);
    } 

    fscanf(read_this, "%d", &size_file);
    *input_vals = (int*) malloc(size_file * sizeof(int));

    //Reading input file values
    for (int i = 0; i < size_file; i++) {
        fscanf(read_this, "%d", &read_value);
        (*input_vals)[i] = read_value;
    }

    fclose(read_this);
    return size_file;
}

/*
helper device function to solve a given logic gate and inputs
*/
__device__ int gate_solver(int gate, int x1, int x2) {
    switch (gate) {
    case AND:
        return x1 && x2;
    case OR:
        return x1 || x2;
    case NAND:
        return !(x1 && x2);
    case NOR:
        return !(x1 || x2);
    case XOR:
        return (x1 || x2) && !(x1 && x2);
    case XNOR:
        return (x1 && x2) || (!x1 && !x2);
    }
}


__device__ int globalQueue[7000000];
__device__ int numNextLevelNodes = 0;

__global__ void global_queuing_kernel(int totalThreads, int countNodes, int* nodePtrs, int* currLevelNodes, int* nodeNeighbors, int* nodeVisited, int* nodeGate, int* nodeInput, int* nodeOutput) {
    
    int nodesPerThread = countNodes / totalThreads;
    int threadIndex = threadIdx.x + (blockDim.x * blockIdx.x);
    int beginIdx = threadIndex * nodesPerThread;
    
    //Loop over all nodes in the current level
    for (int id = beginIdx; id < countNodes && id < beginIdx + nodesPerThread; id++) {
        int nodeIdx = currLevelNodes[id];
        //Loop over all neighbors of the node
        for (int secondId = nodePtrs[nodeIdx]; secondId < nodePtrs[nodeIdx+1]; secondId++) {   
            int neighborIdx = nodeNeighbors[secondId];
            //If the neighbor hasn’t been visited yet
            const int visited = atomicExch(&(nodeVisited[neighborIdx]),1);
            if (!visited) {
                
                nodeOutput[neighborIdx] = gate_solver(nodeGate[neighborIdx], nodeOutput[nodeIdx], nodeInput[neighborIdx]);
                
                //Add it to the global queue
                const int globalQueueIdx = atomicAdd(&numNextLevelNodes,1); 
                globalQueue[globalQueueIdx] = neighborIdx; 
            }    
        }
         __syncthreads();
    }
}



int main(int argc, char *argv[]){

    // ~~~~~~~~~~~~~~~~~~~~~~~
    // step 1: parse arguments
    // ~~~~~~~~~~~~~~~~~~~~~~~

    // User Input
    char* input_1 = argv[1]; char* input_2 = argv[2]; char* input_3 = argv[3]; char* input_4 = argv[4];
    char* output_node = argv[5]; char* output_nln = argv[6];
    
    // Declaring Variables
    
    //Node variables
    int countNodes = 0; int* nodePtrs; int numNodePtrs = 0;
    int* nodeInput; int* nodeGate; int* nodeOutput;
    
    //Node Neighbours variables
    int* nodeNeighbors; int countNodeNeighbors = 0; int* nodeVisited; 
    
    //Node level variables
    int* currLevelNodes; int numCurrLevelNodes = 0;  //Current level
    int* nextLevelNodes; int numNextLevelNodes = 0; //Next level

    //Argument Check
    if (argc != 7) {return fprintf(stderr, "Not Enough or Too Many Arguments!\n");}
    
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 2: read in inputs from file
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    countNodes = read_third_file(input_3, &nodeVisited, &nodeGate, &nodeInput, &nodeOutput);
    numNodePtrs = read_input(&nodePtrs, input_1);
    countNodeNeighbors = read_input(&nodeNeighbors, input_2);
    
    numCurrLevelNodes = read_input(&currLevelNodes, input_4);
    nextLevelNodes = (int *) malloc(countNodes * sizeof(int));
    
    // ~~~~~~~~~~~~~~~~~~~~~~~~
    // step 3: allocate for GPU
    // ~~~~~~~~~~~~~~~~~~~~~~~~

    cudaMemcpyToSymbol(globalQueue,nextLevelNodes, countNodes * sizeof(int));
    
    int countNodesSize = countNodes * sizeof(int);
    int numBlocks = 35;
    int blockSize = 128;
    int* nodePtrs_cuda = (int*)malloc( numNodePtrs * sizeof(int)) ; 
    int* currLevelNodes_cuda = (int*)malloc( numCurrLevelNodes * sizeof(int)) ; 
    int* nodeNeighbors_cuda = (int*)malloc( countNodeNeighbors * sizeof(int)) ; 
    int* nodeVisited_cuda = (int*)malloc( countNodesSize) ; 
    int* nodeGate_cuda = (int*)malloc( countNodesSize) ; 
    int* nodeInput_cuda = (int*)malloc( countNodesSize) ; 
    int* nodeOutput_cuda = (int*)malloc(countNodesSize) ; 
    
    // Calling CUDA Functions

    // CUDA: Accessing nodePtrs_cuda
    cudaMalloc (&nodePtrs_cuda, numNodePtrs * sizeof(int));
    cudaMemcpy(nodePtrs_cuda, nodePtrs, numNodePtrs * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing currLevelNodes_cuda
    cudaMalloc (&currLevelNodes_cuda, numCurrLevelNodes * sizeof(int));
    cudaMemcpy(currLevelNodes_cuda, currLevelNodes, numCurrLevelNodes * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeNeighbors_cuda
    cudaMalloc (&nodeNeighbors_cuda, countNodeNeighbors * sizeof(int));
    cudaMemcpy(nodeNeighbors_cuda, nodeNeighbors, countNodeNeighbors * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeVisited_cuda
    cudaMalloc (&nodeVisited_cuda, countNodesSize);
    cudaMemcpy(nodeVisited_cuda, nodeVisited,countNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeGate_cuda
    cudaMalloc (&nodeGate_cuda, countNodesSize);
    cudaMemcpy(nodeGate_cuda, nodeGate, countNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeInput_cuda
    cudaMalloc (&nodeInput_cuda, countNodesSize);
    cudaMemcpy(nodeInput_cuda, nodeInput, countNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeOutput_cuda
    cudaMalloc (&nodeOutput_cuda, countNodesSize);
    cudaMemcpy(nodeOutput_cuda, nodeOutput, countNodesSize, cudaMemcpyHostToDevice);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 4: time parallel execution
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    cudaEvent_t startGPU, stopGPU;
    cudaEventCreate(&startGPU);
    cudaEventCreate(&stopGPU);

    cudaEventRecord(startGPU);


    // kernel call
    global_queuing_kernel <<< numBlocks, blockSize >>> (blockSize * numBlocks, countNodes, nodePtrs_cuda, currLevelNodes_cuda, nodeNeighbors_cuda, nodeVisited_cuda, nodeGate_cuda, nodeInput_cuda, nodeOutput_cuda);

    cudaDeviceSynchronize();

    cudaEventRecord(stopGPU);
    cudaEventSynchronize(stopGPU);


    float timeGPU;
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);

    printf("Global Queue: %.6f ms\n", timeGPU);

    cudaEventDestroy(startGPU);
    cudaEventDestroy(stopGPU);
    
    //cudaGetLastError();

    cudaMemcpyFromSymbol(&numNextLevelNodes, numNextLevelNodes, sizeof(int), 0, cudaMemcpyDeviceToHost);
    cudaMemcpyFromSymbol(nextLevelNodes,globalQueue, countNodesSize);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 5: write to file and done!
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    int *output_buff;
    output_buff = (int*)malloc( countNodesSize); 
    cudaMemcpy(output_buff, nodeOutput_cuda, countNodesSize, cudaMemcpyDeviceToHost);

    // Opening output files
    FILE *outfile_node = fopen(output_node, "w"); FILE *outfile_nln = fopen(output_nln, "w");
    
    // File Check
    if(!outfile_node || !outfile_nln){
        return fprintf(stderr, "Invalid Output Files");
    } 

    //Writing values to output_nodeOutput
    fprintf(outfile_node, "%d\n", countNodes);
    for (int i = 0; i < countNodes; i++) { fprintf(outfile_node, "%d\n", output_buff[i]); }
    fclose(outfile_node);
    
    //Writing values to output_nextLevelNodes
    fprintf(outfile_nln, "%d\n", numNextLevelNodes);
    for (int i = 0; i < numNextLevelNodes; i++) { fprintf(outfile_nln, "%d\n", nextLevelNodes[i]); }
    fclose(outfile_nln);

    // ~~~~~~~~~~~~~~~~~~~~~
    // step 6: free at last!
    // ~~~~~~~~~~~~~~~~~~~~~
    free(nodeGate); free(nodeInput); free(nodeOutput);
    free(nodePtrs); free(currLevelNodes); free(nodeNeighbors); free(nodeVisited);
    cudaFree(currLevelNodes_cuda); cudaFree(nodeNeighbors_cuda); cudaFree(nodePtrs_cuda); 
    cudaFree(nodeVisited_cuda); cudaFree(nodeInput_cuda); cudaFree(nodeOutput_cuda);
    cudaFree(nodeGate_cuda);
}

Overwriting global_queuing.cu


### Compile Source File - global_queueing.cu

In [40]:
!nvcc global_queuing.cu -o global_queuing
!./global_queuing input1.raw input2.raw input3.raw input4.raw output_nodeOutput.raw output_nextLevelNodes.raw

In file included from [01m[Kglobal_queuing.cu:12:0[m[K:
  [01;35m[K^~~~~~~[m[K

In file included from [01m[Kglobal_queuing.cu:12:0[m[K:
  [01;35m[K^~~~~~~[m[K
Global Queue: 0.703232 ms


### Compare Output Files

In [41]:
!./compareNodeOutput output_nodeOutput.raw sol_nodeOutput.raw
!./compareNextLevelNodes output_nextLevelNodes.raw sol_nextLevelNodes.raw

Total Errors : 0	No errors!


## Part 3: Block Queuing



In [42]:
%%writefile block_queuing.cu

#ifndef CUDACC
#define CUDACC
#endif

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

#include <stdio.h>
#include <stdlib.h>

#define AND 0
#define OR 1
#define NAND 2
#define NOR 3
#define XOR 4
#define XNOR 5

/*
helper functions to read and write data
*/
int read_input(int** input_vals, char* path_inputfile);
int read_third_file(char* path_inputfile, int** first_input, int** second_input, int** third_input, int** fourth_input);
void write_data(int* data, int length, char* filepath);

__device__ int numNextLevelNodes = 0;
__device__ int nextLevelNodesQueue[5000000];

/*
helper device function to solve a given logic gate and inputs
*/
__device__ int gate_solver(int gate, int x1, int x2) {
    switch (gate) {
    case AND:
        return x1 && x2;
    case OR:
        return x1 || x2;
    case NAND:
        return !(x1 && x2);
    case NOR:
        return !(x1 || x2);
    case XOR:
        return (x1 || x2) && !(x1 && x2);
    case XNOR:
        return (x1 && x2) || (!x1 && !x2);
    }
}

__global__ void block_queuing_kernel(int numCurrLevelNodes, int* currLevelNodes, int* nodeNeighbors, int* nodePtrs, int* nodeVisited, int* nodeInput, int* nodeOutput, int* nodeGate, int queueSize){
    
    // initialize shared memory queue
    extern __shared__ int sharedBlockQueue[];
    __shared__ int sharedBlockQueueSize, blockGlobalQueueIdx;

    if (threadIdx.x == 0)
        sharedBlockQueueSize = 0; 

    __syncthreads();

    int threadIndex = threadIdx.x + (blockDim.x * blockIdx.x);

    // Loop over all nodes in the current level
    for (int id = threadIndex; id < numCurrLevelNodes; id++) {
        int nodeIdx = currLevelNodes[id];      
        //Loop over all neighbors of the node
        for (int nId = nodePtrs[nodeIdx]; nId < nodePtrs[nodeIdx+1]; nId++) {          
            int neighborIdx = nodeNeighbors[nId];
            // If the neighbor hasn’t been visited yet
            const int visited = atomicExch(&(nodeVisited[neighborIdx]), 1);
            if (!(visited)) {
                const int queueIdx = atomicAdd(&sharedBlockQueueSize, 1);
                // Solve Gate
                nodeOutput[neighborIdx] = gate_solver(nodeGate[neighborIdx], nodeOutput[nodeIdx], nodeInput[neighborIdx]);
                // if not full add to block queue 
                if (queueIdx < queueSize){
                  sharedBlockQueue[queueIdx] = neighborIdx;
                }                    
                else { // else, add to global queue
                    sharedBlockQueueSize = queueSize;
                    const int GlIdx = atomicAdd(&numNextLevelNodes, 1);
                    nextLevelNodesQueue[GlIdx] = neighborIdx; 
                }
            }      
        }
    }
    
    __syncthreads();
    if (threadIdx.x == 0){
      blockGlobalQueueIdx = atomicAdd(&numNextLevelNodes, sharedBlockQueueSize);
    } 
    __syncthreads();

    // storing block queue in global queue
    for (int i = threadIdx.x; i < sharedBlockQueueSize; i += blockDim.x)
        nextLevelNodesQueue[blockGlobalQueueIdx + i] = sharedBlockQueue[i];
}

int main(int argc, char *argv[]){

    // ~~~~~~~~~~~~~~~~~~~~~~~
    // step 1: parse arguments
    // ~~~~~~~~~~~~~~~~~~~~~~~

    //User Input 
    const int blockSize = atoi(argv[1]); const int numBlocks = atoi(argv[2]); const int queueSize = atoi(argv[3]);
    char* input_1 = argv[4]; char* input_2 = argv[5]; char* input_3 = argv[6]; char* input_4 = argv[7];
    char* output_node = argv[8]; char* output_nln = argv[9];
    
    // Node Variables
    int countNodes = 0; int numNodePtrs = 0; int *nodePtrs;
    int *nodeGate; int *nodeInput; int *nodeOutput;
    
    // Node Neighbours Variables
    int *nodeNeighbors; int countNodeNeighbors = 0; int *nodeVisited; 

    //Node level variables
    int *currLevelNodes; int numCurrLevelNodes = 0; //Current Level
    int *nextLevelNodes; int numNextLevelNodes = 0; //Next Level


    //Argument Check
    if (argc != 10) {return fprintf(stderr, "Not Enough or Too Many Arguments!\n");}
  
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 2: read in inputs from file
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    countNodes = read_third_file(input_3, &nodeVisited, &nodeGate, &nodeInput, &nodeOutput);
    numNodePtrs = read_input(&nodePtrs, input_1);
    countNodeNeighbors = read_input(&nodeNeighbors, input_2);
    
    numCurrLevelNodes = read_input(&currLevelNodes, input_4);

    // ~~~~~~~~~~~~~~~~~~~~~~~~
    // step 3: allocate for GPU
    // ~~~~~~~~~~~~~~~~~~~~~~~~

    nextLevelNodes = (int *)malloc(countNodes*sizeof(int));
    int *nextLevelNodes_cuda = (int *)malloc(countNodes*sizeof(int));
    int numNodesSize = countNodes * sizeof(int);
    int* currLevelNodes_cuda = (int*)malloc(numCurrLevelNodes * sizeof(int)); 
    int* nodeNeighbors_cuda = (int*)malloc(countNodeNeighbors * sizeof(int)); 
    int* nodePtrs_cuda = (int*)malloc(numNodePtrs * sizeof(int)) ; 
    int* nodeVisited_cuda = (int*)malloc(numNodesSize);
    int* nodeInput_cuda = (int*)malloc(numNodesSize); 
    int* nodeOutput_cuda = (int*)malloc(numNodesSize); 
    int* nodeGate_cuda = (int*)malloc(numNodesSize);
    
    // Calling CUDA Functions

    // CUDA: Accessing nextLevelNodes_cuda
    cudaMalloc (&nextLevelNodes_cuda, numCurrLevelNodes * sizeof(int));
    cudaMemcpy(nextLevelNodes_cuda, nextLevelNodes, numCurrLevelNodes * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing currLevelNodes_cuda
    cudaMalloc (&currLevelNodes_cuda, numCurrLevelNodes * sizeof(int));
    cudaMemcpy(currLevelNodes_cuda, currLevelNodes, numCurrLevelNodes * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeNeighbors_cuda
    cudaMalloc (&nodeNeighbors_cuda, countNodeNeighbors * sizeof(int));
    cudaMemcpy(nodeNeighbors_cuda, nodeNeighbors, countNodeNeighbors * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing nodePtrs_cuda
    cudaMalloc (&nodePtrs_cuda, numNodePtrs * sizeof(int));
    cudaMemcpy(nodePtrs_cuda, nodePtrs, numNodePtrs * sizeof(int), cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeVisited_cuda
    cudaMalloc (&nodeVisited_cuda, numNodesSize);
    cudaMemcpy(nodeVisited_cuda, nodeVisited,numNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeInput_cuda
    cudaMalloc (&nodeInput_cuda, numNodesSize);
    cudaMemcpy(nodeInput_cuda, nodeInput, numNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeOutput_cuda
    cudaMalloc (&nodeOutput_cuda, numNodesSize);
    cudaMemcpy(nodeOutput_cuda, nodeOutput, numNodesSize, cudaMemcpyHostToDevice);
    // CUDA: Accessing nodeGate_cuda
    cudaMalloc (&nodeGate_cuda, numNodesSize);
    cudaMemcpy(nodeGate_cuda, nodeGate, numNodesSize, cudaMemcpyHostToDevice);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 4: time parallel execution
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    cudaEvent_t startGPU, stopGPU;
    cudaEventCreate(&startGPU);
    cudaEventCreate(&stopGPU);

    cudaEventRecord(startGPU);

    block_queuing_kernel<<<numBlocks, blockSize, queueSize*sizeof(int)>>>(numCurrLevelNodes, currLevelNodes_cuda, nodeNeighbors_cuda, nodePtrs_cuda, nodeVisited_cuda, nodeInput_cuda, nodeOutput_cuda, nodeGate_cuda, queueSize);
    cudaDeviceSynchronize();

    cudaEventRecord(stopGPU);
    cudaEventSynchronize(stopGPU);

    float timeGPU;
    cudaEventElapsedTime(&timeGPU, startGPU, stopGPU);

    printf("Block Queue: %.6f ms\n", timeGPU);

    cudaEventDestroy(startGPU);
    cudaEventDestroy(stopGPU);

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // step 5: write to file and done!
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    int* output_buff;
    output_buff = (int*)malloc(numNodesSize);
    cudaMemcpy(output_buff, nodeOutput_cuda, numNodesSize, cudaMemcpyDeviceToHost);

    cudaMemcpyFromSymbol(&numNextLevelNodes, numNextLevelNodes, sizeof(int), 0, cudaMemcpyDeviceToHost);
    cudaMemcpyFromSymbol(nextLevelNodes, nextLevelNodesQueue, numNextLevelNodes * sizeof(int), 0, cudaMemcpyDeviceToHost);

    // Writing data to output files
    write_data(output_buff, countNodes, output_node); //ouput_node
    write_data(nextLevelNodes, numNextLevelNodes, output_nln); //output_NodeLevelNode

    // ~~~~~~~~~~~~~~~~~~~~~
    // step 6: free at last!
    // ~~~~~~~~~~~~~~~~~~~~~
    free(output_buff); free(nextLevelNodes);
    cudaFree(nextLevelNodes_cuda); cudaFree(currLevelNodes_cuda); cudaFree(nodeNeighbors_cuda); 
    cudaFree(nodePtrs_cuda); cudaFree(nodeVisited_cuda); cudaFree(nodeInput_cuda);
    cudaFree(nodeOutput_cuda); cudaFree(nodeGate_cuda);
}

int read_input(int** input_vals, char* path_inputfile) {
  int size_file;
  int read_value;

  FILE* read_this;
  
  //File Open
  if ((read_this = fopen(path_inputfile, "r")) == NULL){
      printf("Error: Cannot Open File!");
      exit(1);
  } 

  fscanf(read_this, "%d", &size_file);
  *input_vals = (int*) malloc(size_file * sizeof(int));

  //Reading input file values
  for (int i = 0; i < size_file; i++) {
      fscanf(read_this, "%d", &read_value);
      (*input_vals)[i] = read_value;
  }

  fclose(read_this);
  return size_file;
}

int read_third_file(char* path_inputfile, int** first_input, int** second_input, int** third_input, int** fourth_input) {
  int size_file;
  int read_value_1; int read_value_2; int read_value_3; int read_value_4;
  FILE* read_this;
  
  //File Open
  if ((read_this = fopen(path_inputfile, "r")) == NULL){
      printf("Error: Cannot Open File!");
      exit(1);
  } 

  //Allocating memory for input variables
  fscanf(read_this, "%d", &size_file);
  *first_input = (int*) malloc(size_file * sizeof(int)); *second_input = (int*) malloc(size_file * sizeof(int));
  *third_input = (int*) malloc(size_file * sizeof(int)); *fourth_input = (int*) malloc(size_file * sizeof(int));

  //Iterating through all four read values
  for (int i = 0; i < size_file; i++) {
      fscanf(read_this, "%d, %d, %d, %d", &read_value_1, &read_value_2, &read_value_3, &read_value_4);
      //Assigning read values to corresponding input variables
      (*first_input)[i] = read_value_1; (*second_input)[i] = read_value_2;
      (*third_input)[i] = read_value_3; (*fourth_input)[i] = read_value_4;
  }

  fclose(read_this);
  return size_file;
}

void write_data(int* data, int length, char* filepath) {
    FILE* fp = fopen(filepath, "w");
    fprintf(fp, "%d\n", length);

    for (int i = 0; i < length; i++) {
        fprintf(fp, "%d\n", (data[i]));
    }

    fclose(fp);
}

Overwriting block_queuing.cu


### Compile and Run Source File - block_queuing.cu

In [43]:
!nvcc block_queuing.cu -o block_queuing
!./block_queuing 64 35 64 input1.raw input2.raw input3.raw input4.raw output_nodeOutput.raw output_nextLevelNodes.raw

In file included from [01m[Kblock_queuing.cu:9:0[m[K:
  [01;35m[K^~~~~~~[m[K

In file included from [01m[Kblock_queuing.cu:9:0[m[K:
  [01;35m[K^~~~~~~[m[K
Block Queue: 39.459743 ms


### Compare Output Files

In [44]:
!./compareNodeOutput output_nodeOutput.raw sol_nodeOutput.raw
!./compareNextLevelNodes output_nextLevelNodes.raw sol_nextLevelNodes.raw

Total Errors : 0	No errors!
