diff --git a/CMakeLists.txt b/CMakeLists.txt index 82a315c20..a58744b37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,7 +193,7 @@ add_test(NAME TestSSSP COMMAND test_sssp) set_tests_properties(TestSSSP PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*1.*: Label.*39.*: Predecessor.*0") -add_test(NAME TestPR COMMAND test_pr --undirected) +add_test(NAME TestPR COMMAND test_pr) set_tests_properties(TestPR PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*2.*: Page Rank.*0.357069.") diff --git a/FAQ.markdown b/FAQ.markdown deleted file mode 100644 index 1af4bb1a7..000000000 --- a/FAQ.markdown +++ /dev/null @@ -1,135 +0,0 @@ -Gunrock FAQ -=========== - -What does it do? ----------------- - -Gunrock is a fast-and-efficient graph processing library on the GPU which -provides a set of graph algorithms used in big data analytics and visualization -with high performance. It also provides a set of operators which abstract the -general operations in graph processing for other developers to build -high-performance graph algorithm prototyes with minimum programming effort. - -How does it do it? ------------------ - -Gunrock takes advantage of the immense computational power available in -commodity-level, off-the-shelf Graphics Processing Units (GPUs), originally -designed to handle the parallel computational tasks in computer graphics, to -perform graph traversal and computation in parallel on thousands of GPU's -computing cores. - -Who should want this? ---------------------- - -Gunrock is built with two kinds of users in mind: The first kind of users are -programmers who build big graph analytics and visualization project and need to -use existing graph primitives provided by Gunrock. The second kind of users -are programmers who want to use Gunrock's high-level, programmable abstraction -to express, develop, and refine their own (and often more complicated) graph -primitives. - -What is the skill set users need to use it? 
-------------------------------------------- - -for the first kind of users, C/C++ background is sufficient. We are also -building Gunrock as a shared library with C interfaces which can be loaded by -other languages such as Python and Julia. for the second kind of users, they -need to have the C/C++ background and also an understanding of parallel -programming, especially BSP (Bulk-Synchronous Programming) model which Gunrock -uses. - -What platforms/languages do people need to know in order to modify or integrate it with other tools? ----------------------------------------------------------------------------------------------------- - -Using the exposed interface, the users do not need to know CUDA or OpenCL to -modify or integrate Gunrock to their own tools. However, an essential -understanding of parallel programming and BSP model is necessary if one wants -to add/modify graph primitives in Gunrock. - -Why would someone want this? ----------------------------- - -The study of social networks, webgraphs, biological networks, and unstructured -meshes in scientific simulation has raised a significant demand for efficient -parallel frameworks for processing and analytics on large-scale graphs. Initial -research efforts in using GPUs for graph processing and anlytics are promising. - -How is it better than the current state of the art? ---------------------------------------------------- - -Most existing CPU large graph processing libraries perform worse on large -graphs with billions of edges. Supercomputer or expensive clusters can achieve -close to real-time feedback with high cost on hardware infrastructure. With -GPUs, we can achieve the same real-time feedback with much lower cost on -hardware. Gunrock has the best performance among the limited research efforts -put on GPU graph processing. With a set of general graph processing operators -exposed to users, it is also more flexible than other GPU/CPU graph library in -terms of programmability. 
- -How would someone get it? -------------------------- - -Gunrock is an open-sourced library. The code, documentation, and quick start -guide are all on its [github page](gunrock.github.io). - -Is a user account required? ---------------------------- - -No. One can use either git clone or download directly to get the source code -and documentation of Gunrock. - -Are all of its components/dependencies easy to find? ----------------------------------------------------- - -Gunrock has three dependencies. Two of them are also GPU primitive library which -also reside on github. The third one is Boost (Gunrock uses Boost Graph Library -to implement CPU reference testing algorithms). All dependencies do not require -installation. To use, one only needs to download or git clone them and put them -in the according directories. More details in the installation section of this -documentation. - -How would someone install it? ------------------------------ - -For C/C++ programmer, integrating Gunrock into your projects is easy. Since it -is a template based library, just add the include files in your code. The -simple example and all the testrigs will provide detailed information on how to -do this. - -For programmers who use Python, Julia, or other language and want to call -Gunrock APIs, we are building a shared library with binary compatible -C interfaces. It will be included in the soon-to-arrive next release of -Gunrock. - -Can anyone install it? Do they need IT help? --------------------------------------------- - -Gunrock is targeted at developers who are familiar with basic software -engineering. For non-technical people, IT help might needed. - -Does this process actually work? All the time? On all systems specified? ------------------------------------------------------------------------- -Currently, Gunrock has been tested on two Linux distributions: Linux Mint and -Ubuntu. But we expect it to run correctly on other Linux distributions too. 
-We are currently building a Cmake solution to port Gunrock to Mac and Windows. -The feature will be included in the soon-to-arrive next release of Gunrock. - -How would someone test that it's working with provided sample data? -------------------------------------------------------------------- - -Testrigs are provided as well as a small simple example for users to test the -correctness and performance of every graph primitive. - -Is the "using" of sample data clear? ------------------------------------- - -On linux, one only needs to go to the dataset directory and run "make", the -script will automatically download all the needed datasets. One can also choose -to download a single dataset in its separated directory. - -How would someone use it with their own data? ---------------------------------------------- - -Gunrock supports Matrix Market (.mtx) file format, users need to pre-process -the graph data into this format before running Gunrock. diff --git a/dataset/small/test_mst.mtx b/dataset/small/test_mst.mtx new file mode 100644 index 000000000..a6949843f --- /dev/null +++ b/dataset/small/test_mst.mtx @@ -0,0 +1,18 @@ +9 9 17 +1 2 2 +2 3 2 +2 4 17 +3 1 2 +3 4 38 +3 5 10 +4 5 2 +5 1 82 +5 2 11 +6 3 100 +6 4 100 +6 5 210 +6 7 2 +6 8 21 +7 3 120 +7 5 110 +8 9 2 \ No newline at end of file diff --git a/gunrock/CMakeLists.txt b/gunrock/CMakeLists.txt index 625225964..0f1d8d6ef 100644 --- a/gunrock/CMakeLists.txt +++ b/gunrock/CMakeLists.txt @@ -26,6 +26,7 @@ set(CUFILES app/cc/cc_app.cu app/sssp/sssp_app.cu app/pr/pr_app.cu + app/mst/mst_app.cu util/test_utils.cu util/error_utils.cu ${mgpu_SOURCE_FILES}) diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu index fb70e9d11..353d107ba 100644 --- a/gunrock/app/bc/bc_app.cu +++ b/gunrock/app/bc/bc_app.cu @@ -8,16 +8,15 @@ /** * @file bc_app.cu * - * @brief Gunrock Betweeness Centrality Implementation + * @brief Gunrock betweeness centrality (BC) application */ -#include #include -// Graph construction 
utils +// graph construction utilities #include -// BC includes +// betweeness centrality includes #include #include #include @@ -36,150 +35,117 @@ using namespace gunrock::app::bc; * @tparam Value * @tparam SizeT * - * @param[out] ggraph_out Pointer to the output CSR graph object - * @param[in] graph Reference to the CSR graph object defined in main driver + * @param[out] graph_o Pointer to the output CSR graph object + * @param[in] csr Reference to the CSR graph object defined in main driver * @param[in] source * @param[in] max_grid_size * @param[in] num_gpus * @param[in] max_queue_sizing * @param[in] context Reference to CudaContext used by moderngpu functions */ -template < - typename VertexId, - typename Value, - typename SizeT > +template void run_bc( - GunrockGraph *ggraph_out, - const Csr &graph, - VertexId source, - int max_grid_size, - int num_gpus, - double max_queue_sizing, - CudaContext& context) { - typedef BCProblem < - VertexId, - SizeT, - Value, - true, // MARK_PREDECESSORS - false > Problem; //does not use double buffer - + GRGraph* graph_o, + const Csr& csr, + const VertexId source, + const int max_grid_size, + const int num_gpus, + const double max_queue_sizing, + CudaContext& context) { + typedef BCProblem Problem; // Allocate host-side array (for both reference and gpu-computed results) - Value *h_sigmas = (Value*)malloc(sizeof(Value) * graph.nodes); - Value *h_bc_values = (Value*)malloc(sizeof(Value) * graph.nodes); - Value *h_ebc_values = (Value*)malloc(sizeof(Value) * graph.edges); - - // Allocate BC enactor map - BCEnactor bc_enactor(false); - - // Allocate problem on GPU - Problem *csr_problem = new Problem; - util::GRError(csr_problem->Init( - false, - graph, - num_gpus), + Value *h_sigmas = (Value*)malloc(sizeof(Value) * csr.nodes); + Value *h_bc_values = (Value*)malloc(sizeof(Value) * csr.nodes); + Value *h_ebc_values = (Value*)malloc(sizeof(Value) * csr.edges); + BCEnactor enactor(false); // Allocate BC enactor map + Problem *problem 
= new Problem; // Allocate problem on GPU + + util::GRError(problem->Init(false, csr, num_gpus), "BC Problem Initialization Failed", __FILE__, __LINE__); - // Perform BC - GpuTimer gpu_timer; + GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start(); // start VertexId start_source; VertexId end_source; if (source == -1) { start_source = 0; - end_source = graph.nodes; + end_source = csr.nodes; } else { start_source = source; end_source = source + 1; } - gpu_timer.Start(); for (VertexId i = start_source; i < end_source; ++i) { - util::GRError(csr_problem->Reset( - i, bc_enactor.GetFrontierType(), max_queue_sizing), + util::GRError(problem->Reset( + i, enactor.GetFrontierType(), max_queue_sizing), "BC Problem Data Reset Failed", __FILE__, __LINE__); - util::GRError(bc_enactor.template Enact( - context, csr_problem, i, max_grid_size), + util::GRError(enactor.template Enact( + context, problem, i, max_grid_size), "BC Problem Enact Failed", __FILE__, __LINE__); } util::MemsetScaleKernel <<< 128, 128>>>( - csr_problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)graph.nodes); - - gpu_timer.Stop(); + problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)csr.nodes); - float elapsed = gpu_timer.ElapsedMillis(); + gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis(); // elapsed time + printf(" device elapsed time: %.4f ms\n", elapsed); - //double avg_duty = 0.0; - //bc_enactor.GetStatistics(avg_duty); - - // Copy out results to Host Device - util::GRError(csr_problem->Extract(h_sigmas, h_bc_values, h_ebc_values), + util::GRError(problem->Extract(h_sigmas, h_bc_values, h_ebc_values), "BC Problem Data Extraction Failed", __FILE__, __LINE__); - // copy h_bc_values per node to GunrockGraph output - ggraph_out->node_values = (float*)&h_bc_values[0]; - // copy h_ebc_values per edge to GunrockGraph output - ggraph_out->edge_values = (float*)&h_ebc_values[0]; - - printf("GPU Betweeness Centrality finished in %lf msec.\n", elapsed); - - // Cleanup - if (csr_problem) delete 
csr_problem; - //if (h_sigmas) free(h_sigmas); - //if (h_bc_values) free(h_bc_values); + graph_o->node_values = (float*)&h_bc_values[0]; // h_bc_values per node + graph_o->edge_values = (float*)&h_ebc_values[0]; // h_ebc_values per edge + if (problem) { delete problem; } cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out GunrockGraph type output - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] bc_config bc specific configurations - * @param[in] data_type bc data_type configurations - * @param[in] context moderngpu context + * @param[out] graph_o GRGraph type output + * @param[in] graph_i GRGraph type input graph + * @param[in] config Specific configurations + * @param[in] data_t Data type configurations + * @param[in] context ModernGPU context */ void dispatch_bc( - GunrockGraph *ggraph_out, - const GunrockGraph *ggraph_in, - GunrockConfig bc_config, - GunrockDataType data_type, - CudaContext& context) { - switch (data_type.VTXID_TYPE) { + GRGraph *graph_o, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t, + CudaContext &context) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = // not support yet printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = // not support yet printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_FLOAT: { - // template type = + case VALUE_FLOAT: { // template type = // build input csr format graph Csr csr_graph(false); - csr_graph.nodes = ggraph_in->num_nodes; - csr_graph.edges = ggraph_in->num_edges; - csr_graph.row_offsets = (int*)ggraph_in->row_offsets; - csr_graph.column_indices = 
(int*)ggraph_in->col_indices; + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; // bc configurations - int src_node = -1; //!< Use whatever the specified graph-type's default is - int max_grid_size = 0; //!< maximum grid size (0: leave it up to the enactor) - int num_gpus = 1; //!< Number of GPUs for multi-gpu enactor to use - float max_queue_sizing = 1.0; //!< Maximum size scaling factor for work queues + int src_node = -1; // default source vertex to start + int max_grid_size = 0; // leave it up to the enactor + int num_gpus = 1; // Number of GPUs for multi-gpu + float max_queue_sizing = 1.0; // Maximum size scaling factor // determine source vertex to start bc - switch (bc_config.src_mode) { + switch (config.src_mode) { case randomize: { src_node = graphio::RandomNode(csr_graph.nodes); break; @@ -190,7 +156,7 @@ void dispatch_bc( break; } case manually: { - src_node = bc_config.src_node; + src_node = config.src_node; break; } default: { @@ -198,11 +164,11 @@ void dispatch_bc( break; } } - max_queue_sizing = bc_config.queue_size; + max_queue_sizing = config.queue_size; // lunch bc function run_bc( - ggraph_out, + graph_o, csr_graph, src_node, max_grid_size, @@ -227,29 +193,68 @@ void dispatch_bc( /* * @brief gunrock_bc function * - * @param[out] ggraph_out output of bc problem - * @param[in] ggraph_in input graph need to process on - * @param[in] bc_config gunrock primitive specific configurations - * @param[in] data_type gunrock datatype struct + * @param[out] graph_o output of bc problem + * @param[in] graph_i input graph need to process on + * @param[in] config gunrock primitive specific configurations + * @param[in] data_t gunrock data_t struct */ -void gunrock_bc_func( - GunrockGraph *ggraph_out, - const GunrockGraph *ggraph_in, - GunrockConfig bc_config, - GunrockDataType data_type) { - - // moderngpu preparations - 
int device = 0; - device = bc_config.device; +void gunrock_bc( + GRGraph *graph_o, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t) { + unsigned int device = 0; + device = config.device; ContextPtr context = mgpu::CreateCudaDevice(device); + dispatch_bc(graph_o, graph_i, config, data_t, *context); +} + +/* + * @brief Simple interface take in CSR arrays as input + * @param[out] bfs_label Return BC node centrality per nodes + * @param[in] num_nodes Number of nodes of the input graph + * @param[in] num_edges Number of edges of the input graph + * @param[in] row_offsets CSR-formatted graph input row offsets + * @param[in] col_indices CSR-formatted graph input column indices + * @param[in] source Source to begin traverse + */ +void bc( + float* bc_scores, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const int source) { + printf("-------------------- setting --------------------\n"); + + struct GRTypes data_t; // primitive-specific data types + data_t.VTXID_TYPE = VTXID_INT; // integer + data_t.SIZET_TYPE = SIZET_INT; // integer + data_t.VALUE_TYPE = VALUE_FLOAT; // float BC scores + + struct GRSetup config; // primitive-specific configures + config.device = 0; // setting device to run + config.src_node = source; // source vertex to begin + config.queue_size = 1.0f; // maximum queue size factor + + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges); + + printf("-------------------- running --------------------\n"); + gunrock_bc(graph_o, graph_i, config, data_t); + memcpy(bc_scores, (float*)graph_o->node_values, num_nodes * sizeof(float)); + + if (graph_i) 
free(graph_i); + if (graph_o) free(graph_o); - // lunch dispatch function - dispatch_bc( - ggraph_out, - ggraph_in, - bc_config, - data_type, - *context); + printf("------------------- completed -------------------\n"); } // Leave this at the end of the file diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh index 59b1cf136..65f5f7e0c 100644 --- a/gunrock/app/bc/bc_enactor.cuh +++ b/gunrock/app/bc/bc_enactor.cuh @@ -988,7 +988,7 @@ public: Problem, // Problem data type 300, // CUDA_ARCH INSTRUMENT, // INSTRUMENT - 8, // MIN_CTA_OCCUPANCY + 1, // MIN_CTA_OCCUPANCY 10, // LOG_THREADS 8, // LOG_BLOCKS 32*128, // LIGHT_EDGE_THRESHOLD (used for partitioned advance mode) diff --git a/gunrock/app/bc/bc_functor.cuh b/gunrock/app/bc/bc_functor.cuh index 00c9d2f1e..3ec577ac3 100644 --- a/gunrock/app/bc/bc_functor.cuh +++ b/gunrock/app/bc/bc_functor.cuh @@ -161,7 +161,6 @@ struct BackwardFunctor */ static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0) { - VertexId s_label; VertexId d_label; util::io::ModifiedLoad::Ld( @@ -304,7 +303,7 @@ struct BackwardFunctor2 util::io::ModifiedLoad::Ld( to_delta, problem->deltas + d_id); - //Value result = from_sigma / to_sigma * (1.0 + to_delta); + Value result = from_sigma / to_sigma * (1.0 + to_delta); //Accumulate delta value diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu index 026c7d6fc..b6313fffa 100644 --- a/gunrock/app/bfs/bfs_app.cu +++ b/gunrock/app/bfs/bfs_app.cu @@ -8,21 +8,19 @@ /** * @file bfs_app.cu * - * @brief Gunrock Breadth-First Search implementation + * @brief Gunrock breadth-first search (BFS) application */ -#include #include -// Graph construction utils +// graph construction utilities #include -// BFS includes +// breadth-first search includes #include #include #include -// MGPU include #include using namespace gunrock; @@ -39,8 +37,8 @@ using namespace gunrock::app::bfs; * @tparam 
MARK_PREDECESSORS * @tparam ENABLE_IDEMPOTENCE * - * @param[out] ggraph_out Pointer to the output CSR graph - * @param[in] ggraph_in Reference to the CSR graph we process on + * @param[out] graph_o Pointer to the output CSR graph + * @param[in] graph_i Reference to the CSR graph we process on * @param[in] src Source node where BFS starts * @param[in] max_grid_size Maximum CTA occupancy * @param[in] num_gpus Number of GPUs @@ -48,115 +46,89 @@ using namespace gunrock::app::bfs; * @param[in] context Reference to CudaContext used by moderngpu functions * */ -template < - typename VertexId, - typename Value, - typename SizeT, - bool MARK_PREDECESSORS, - bool ENABLE_IDEMPOTENCE > +template void run_bfs( - GunrockGraph *ggraph_out, - const Csr &ggraph_in, - const VertexId src, - int max_grid_size, - int num_gpus, - double max_queue_sizing, - CudaContext& context) { - // Preparations - typedef BFSProblem < - VertexId, - SizeT, - Value, - MARK_PREDECESSORS, - ENABLE_IDEMPOTENCE, - (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE) > Problem; - - // Allocate host-side label array for gpu-computed results - VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes); + GRGraph* graph_o, + const Csr& csr, + const VertexId src, + const int num_gpus, + const double max_queue_sizing, + CudaContext& context) { + typedef BFSProblem Problem; + // Allocate host-side label array for GPU-computed results + VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * csr.nodes); VertexId *h_preds = NULL; if (MARK_PREDECESSORS) { - //h_preds = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes); + // h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes); } - // Allocate BFS enactor map - BFSEnactor bfs_enactor(false); - - // Allocate problem on GPU - Problem *csr_problem = new Problem; - util::GRError(csr_problem->Init( - false, - ggraph_in, - num_gpus), - "Problem BFS Initialization Failed", __FILE__, __LINE__); + BFSEnactor enactor(false); // Allocate BFS enactor map + 
Problem *problem = new Problem; // Allocate problem on GPU - // Perform BFS - GpuTimer gpu_timer; + util::GRError(problem->Init(false, csr, num_gpus), + "BFS Problem Initialization Failed", __FILE__, __LINE__); - util::GRError(csr_problem->Reset( - src, bfs_enactor.GetFrontierType(), max_queue_sizing), + util::GRError(problem->Reset( + src, enactor.GetFrontierType(), max_queue_sizing), "BFS Problem Data Reset Failed", __FILE__, __LINE__); - gpu_timer.Start(); - util::GRError(bfs_enactor.template Enact( - context, csr_problem, src, max_grid_size), + GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start(); // start + + util::GRError(enactor.template Enact(context, problem, src), "BFS Problem Enact Failed", __FILE__, __LINE__); - gpu_timer.Stop(); - float elapsed = gpu_timer.ElapsedMillis(); + gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis(); // elapsed time + printf(" device elapsed time: %.4f ms\n", elapsed); - // Copy out results back to Host - util::GRError(csr_problem->Extract(h_labels, h_preds), + util::GRError(problem->Extract(h_labels, h_preds), "BFS Problem Data Extraction Failed", __FILE__, __LINE__); - // label per node to GunrockGraph struct - ggraph_out->node_values = (int*)&h_labels[0]; - - // Clean up - if (csr_problem) delete csr_problem; - //if (h_preds) free(h_preds); + graph_o->node_values = (int*)&h_labels[0]; // label per node to graph_o + if (problem) { delete problem; } + if (h_preds) { free(h_preds); } cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out GunrockGraph type output - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] bfs_config bfs specific configurations - * @param[in] data_type bfs data_type configurations - * @param[in] context moderngpu context + * @param[out] graph_o GRGraph type output + * @param[in] graph_i GRGraph type input graph + * @param[in] config Specific configurations + * @param[in] data_t Data type configurations + * @param[in] 
context ModernGPU context */ void dispatch_bfs( - GunrockGraph *ggraph_out, - const GunrockGraph *ggraph_in, - GunrockConfig bfs_config, - GunrockDataType data_type, - CudaContext& context) { - switch (data_type.VTXID_TYPE) { + GRGraph* graph_o, + const GRGraph* graph_i, + const GRSetup config, + const GRTypes data_t, + CudaContext& context) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = // build input csr format graph Csr csr_graph(false); - csr_graph.nodes = ggraph_in->num_nodes; - csr_graph.edges = ggraph_in->num_edges; - csr_graph.row_offsets = (int*)ggraph_in->row_offsets; - csr_graph.column_indices = (int*)ggraph_in->col_indices; + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; // default configurations - int src_node = 0; //!< default source vertex to start - int num_gpus = 1; //!< number of GPUs for multi-gpu enactor to use - int max_grid_size = 0; //!< maximum grid size (0: leave it up to the enactor) - bool mark_pred = false; //!< whether to mark predecessor or not - bool idempotence = false; //!< whether or not to enable idempotence - float max_queue_sizing = 1.0f; //!< maximum size scaling factor for work queues + int src_node = 0; // default source vertex to start + int num_gpus = 1; // number of GPUs for multi-GPU + bool mark_pred = 0; // whether to mark predecessor or not + bool idempotence = 0; // whether or not enable idempotent + float max_queue_sizing = 1.0f; // maximum size scaling factor - // determine source vertex to start bfs - switch (bfs_config.src_mode) { + // determine source vertex to start + switch (config.src_mode) { case randomize: { src_node = 
graphio::RandomNode(csr_graph.nodes); break; @@ -167,7 +139,7 @@ void dispatch_bfs( break; } case manually: { - src_node = bfs_config.src_node; + src_node = config.src_node; break; } default: { @@ -175,26 +147,24 @@ void dispatch_bfs( break; } } - mark_pred = bfs_config.mark_pred; - idempotence = bfs_config.idempotence; - max_queue_sizing = bfs_config.queue_size; + mark_pred = config.mark_pred; + idempotence = config.idempotence; + max_queue_sizing = config.queue_size; if (mark_pred) { if (idempotence) { run_bfs( - ggraph_out, + graph_o, csr_graph, src_node, - max_grid_size, num_gpus, max_queue_sizing, context); } else { run_bfs( - ggraph_out, + graph_o, csr_graph, src_node, - max_grid_size, num_gpus, max_queue_sizing, context); @@ -202,19 +172,17 @@ void dispatch_bfs( } else { if (idempotence) { run_bfs( - ggraph_out, + graph_o, csr_graph, src_node, - max_grid_size, num_gpus, max_queue_sizing, context); } else { run_bfs( - ggraph_out, + graph_o, csr_graph, src_node, - max_grid_size, num_gpus, max_queue_sizing, context); @@ -225,14 +193,12 @@ void dispatch_bfs( csr_graph.column_indices = NULL; break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = // not yet support printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_FLOAT: { - // template type = + case VALUE_FLOAT: { // template type = // not yet support printf("Not Yet Support This DataType Combination.\n"); break; @@ -249,24 +215,70 @@ void dispatch_bfs( /* * @brief gunrock_bfs function * - * @param[out] ggraph_out output subgraph of bfs problem - * @param[in] ggraph_in input graph need to process on - * @param[in] bfs_config gunrock primitive specific configurations - * @param[in] data_type gunrock datatype struct + * @param[out] graph_o output subgraph of the problem + * @param[in] graph_i input graph need to process on + * @param[in] config gunrock primitive specific configurations + * @param[in] data_t gunrock data_t struct */ -void 
gunrock_bfs_func( - GunrockGraph *ggraph_out, - const GunrockGraph *ggraph_in, - GunrockConfig bfs_config, - GunrockDataType data_type) { - - // moderngpu preparations - int device = 0; - device = bfs_config.device; +void gunrock_bfs( + GRGraph* graph_o, + const GRGraph* graph_i, + const GRSetup config, + const GRTypes data_t) { + unsigned int device = 0; + device = config.device; ContextPtr context = mgpu::CreateCudaDevice(device); + dispatch_bfs(graph_o, graph_i, config, data_t, *context); +} + +/* + * @brief Simple interface take in CSR arrays as input + * @param[out] bfs_label Return BFS labels per nodes + * @param[in] num_nodes Number of nodes of the input graph + * @param[in] num_edges Number of edges of the input graph + * @param[in] row_offsets CSR-formatted graph input row offsets + * @param[in] col_indices CSR-formatted graph input column indices + * @param[in] source Source to begin traverse + */ +void bfs( + int* bfs_label, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const int source) { + printf("-------------------- setting --------------------\n"); + + struct GRTypes data_t; // primitive-specific data types + data_t.VTXID_TYPE = VTXID_INT; // integer + data_t.SIZET_TYPE = SIZET_INT; // integer + data_t.VALUE_TYPE = VALUE_INT; // integer + + struct GRSetup config; // primitive-specific configures + config.device = 0; // setting device to run + config.src_node = source; // source vertex to begin + config.mark_pred = false; // do not mark predecessors + config.idempotence = false; // whether enable idempotent + config.queue_size = 1.0f; // maximum queue size factor + + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + printf(" 
loaded %d nodes and %d edges\n", num_nodes, num_edges); + + printf("-------------------- running --------------------\n"); + gunrock_bfs(graph_o, graph_i, config, data_t); + memcpy(bfs_label, (int*)graph_o->node_values, num_nodes * sizeof(int)); + + if (graph_i) free(graph_i); + if (graph_o) free(graph_o); - // launch dispatch function - dispatch_bfs(ggraph_out, ggraph_in, bfs_config, data_type, *context); + printf("------------------- completed -------------------\n"); } // Leave this at the end of the file diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu index 97723087c..b4ac393b6 100644 --- a/gunrock/app/cc/cc_app.cu +++ b/gunrock/app/cc/cc_app.cu @@ -8,20 +8,15 @@ /** * @file cc_app.cu * - * @brief connected component implementation. + * @brief connected component (CC) application */ -#include -#include -#include -#include -#include #include -// Graph construction utils +// graph construction utilities #include -// CC includes +// connected component includes #include #include #include @@ -38,112 +33,88 @@ using namespace gunrock::app::cc; * @tparam Value * @tparam SizeT * - * @param[out] ggraph_out Pointer to output CSR graph + * @param[out] graph_o Pointer to output CSR graph * @param[in] csr_graph Reference to the CSR graph we process on * @param[in] max_grid_size Maximum CTA occupancy for CC kernels * @param[in] num_gpus Number of GPUs */ -template < - typename VertexId, - typename Value, - typename SizeT > +template void run_cc( - GunrockGraph *ggraph_out, - unsigned int *components, - const Csr &csr_graph, + GRGraph* graph_o, + unsigned int* components, + const Csr& csr, const int max_grid_size, const int num_gpus) { - - // Define CCProblem - typedef CCProblem < - VertexId, - SizeT, - Value, - true > Problem; //use double buffer + typedef CCProblem Problem; // double buffer // Allocate host-side label array for gpu-computed results VertexId *h_component_ids - = (VertexId*)malloc(sizeof(VertexId) * csr_graph.nodes); - - // Allocate CC 
enactor map - CCEnactor cc_enactor(false); + = (VertexId*)malloc(sizeof(VertexId) * csr.nodes); + CCEnactor cc_enactor(false); // Allocate CC enactor map + Problem *problem = new Problem; // Allocate problem on GPU - // Allocate problem on GPU - Problem *csr_problem = new Problem; - util::GRError(csr_problem->Init( - false, - csr_graph, - num_gpus), + util::GRError(problem->Init(false, csr, num_gpus), "CC Problem Initialization Failed", __FILE__, __LINE__); - // Reset CC Problem Data - util::GRError(csr_problem->Reset( + util::GRError(problem->Reset( cc_enactor.GetFrontierType()), "CC Problem Data Reset Failed", __FILE__, __LINE__); - // Perform Connected Component - GpuTimer gpu_timer; - gpu_timer.Start(); - // Lunch CC Enactor + GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start(); // start + util::GRError(cc_enactor.template Enact( - csr_problem, max_grid_size), + problem, max_grid_size), "CC Problem Enact Failed", __FILE__, __LINE__); - gpu_timer.Stop(); - float elapsed = gpu_timer.ElapsedMillis(); - // Copy out results back to Host Device - util::GRError(csr_problem->Extract(h_component_ids), + gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis(); // elapsed time + printf(" device elapsed time: %.4f ms\n", elapsed); + + util::GRError(problem->Extract(h_component_ids), "CC Problem Data Extraction Failed", __FILE__, __LINE__); // Compute number of components in graph - unsigned int temp = csr_problem->num_components; + unsigned int temp = problem->num_components; *components = temp; - // copy component_id per node to GunrockGraph struct - ggraph_out->node_values = (int*)&h_component_ids[0]; - - printf("GPU Connected Component finished in %lf msec.\n", elapsed); - - // Cleanup - if (csr_problem) delete csr_problem; + // copy component_id per node to GRGraph struct + graph_o->node_values = (int*)&h_component_ids[0]; + if (problem) delete problem; cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out 
GunrockGraph type output - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] cc_config cc specific configurations - * @param[in] data_type data type configurations + * @param[out] graph_o GRGraph type output + * @param[in] graph_i GRGraph type input graph + * @param[in] config cc specific configurations + * @param[in] data_t data type configurations */ void dispatch_cc( - GunrockGraph *ggraph_out, - unsigned int *components, - const GunrockGraph *ggraph_in, - const GunrockConfig cc_config, - const GunrockDataType data_type) { - switch (data_type.VTXID_TYPE) { + GRGraph* graph_o, + unsigned int* components, + const GRGraph* graph_i, + const GRSetup config, + const GRTypes data_t) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = // build input csr format graph Csr csr_graph(false); - csr_graph.nodes = ggraph_in->num_nodes; - csr_graph.edges = ggraph_in->num_edges; - csr_graph.row_offsets = (int*)ggraph_in->row_offsets; - csr_graph.column_indices = (int*)ggraph_in->col_indices; + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; - int max_grid_size = 0; //!< 0: leave it up to the enactor - int num_gpus = 1; //!< number of GPUs + int max_grid_size = 0; // 0: leave it up to the enactor + int num_gpus = 1; // number of GPUs - // lunch cc dispatch function run_cc( - ggraph_out, + graph_o, (unsigned int*)components, csr_graph, max_grid_size, @@ -154,13 +125,11 @@ void dispatch_cc( csr_graph.column_indices = NULL; break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_FLOAT: { - // 
template type = + case VALUE_FLOAT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } @@ -176,20 +145,65 @@ void dispatch_cc( /* * @brief gunrock_cc function * - * @param[out] ggraph_out output subgraph of cc problem - * @param[in] ggraph_in input graph need to process on - * @param[in] cc_configs primitive specific configurations - * @param[in] data_type gunrock data_type struct + * @param[out] graph_o output subgraph of cc problem + * @param[in] graph_i input graph need to process on + * @param[in] config primitive specific configurations + * @param[in] data_t gunrock data_t struct + */ +void gunrock_cc( + GRGraph *graph_o, + unsigned int *components, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t) { + dispatch_cc(graph_o, components, graph_i, config, data_t); +} + +/* + * @brief Simple interface take in CSR arrays as input + * @param[out] components Return component ID for each node + * @param[out] num_comps Return number of components calculated + * @param[in] num_nodes Number of nodes of the input graph + * @param[in] num_edges Number of edges of the input graph + * @param[in] row_offsets CSR-formatted graph input row offsets + * @param[in] col_indices CSR-formatted graph input column indices */ -void gunrock_cc_func( - GunrockGraph *ggraph_out, - unsigned int *components, - const GunrockGraph *ggraph_in, - const GunrockConfig cc_configs, - const GunrockDataType data_type) { - - // lunch dispatch function - dispatch_cc(ggraph_out, components, ggraph_in, cc_configs, data_type); +int cc( + int* components, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices) { + printf("-------------------- setting --------------------\n"); + + struct GRTypes data_t; // primitive-specific data types + data_t.VTXID_TYPE = VTXID_INT; // integer + data_t.SIZET_TYPE = SIZET_INT; // integer + data_t.VALUE_TYPE = VALUE_INT; // integer + + struct GRSetup config; // 
primitive-specific configures + config.device = 0; // setting device to run + + unsigned int num_components = 0; + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges); + + printf("-------------------- running --------------------\n"); + gunrock_cc(graph_o, &num_components, graph_i, config, data_t); + memcpy(components, (int*)graph_o->node_values, num_nodes * sizeof(int)); + + if (graph_i) free(graph_i); + if (graph_o) free(graph_o); + + printf("------------------- completed -------------------\n"); + return num_components; } // Leave this at the end of the file diff --git a/gunrock/app/mst/mst_app.cu b/gunrock/app/mst/mst_app.cu new file mode 100644 index 000000000..55e350471 --- /dev/null +++ b/gunrock/app/mst/mst_app.cu @@ -0,0 +1,177 @@ +// ---------------------------------------------------------------------------- +// Gunrock -- Fast and Efficient GPU Graph Library +// ---------------------------------------------------------------------------- +// This source code is distributed under the terms of LICENSE.TXT +// in the root directory of this source distribution. 
+// ---------------------------------------------------------------------------- + +/** + * @file mst_app.cu + * + * @brief minimum spanning tree (MST) application + */ + +#include + +// graph construction utilities +#include + +// primitive-specific includes +#include +#include +#include + +#include + +using namespace gunrock; +using namespace gunrock::util; +using namespace gunrock::oprtr; +using namespace gunrock::app::mst; + +/** + * @brief run minimum spanning tree + * + * @tparam VertexId + * @tparam Value + * @tparam SizeT + * + * @param[out] graph_o GRGraph type output graph + * @param[in] csr Reference to the CSR graph we process on + * @param[in] max_grid Maximum CTA occupancy + * @param[in] num_gpus Number of GPUs + * @param[in] context Modern GPU context + */ +template +void run_mst( + GRGraph *graph_o, + const Csr &csr, + const int max_grid, + const int num_gpus, + CudaContext &context) { + typedef MSTProblem Problem; // preparations + MSTEnactor enactor(false); // enactor map + VertexId *h_mst = new VertexId[csr.edges]; // results array + Problem *problem = new Problem; // problem on GPU + + util::GRError(problem->Init(false, csr, num_gpus), + "MST Data Initialization Failed", __FILE__, __LINE__); + + util::GRError(problem->Reset(enactor.GetFrontierType()), + "MST Data Reset Failed", __FILE__, __LINE__); + + util::GRError(enactor.template Enact(context, problem, max_grid), + "MST Enact Failed", __FILE__, __LINE__); + + util::GRError(problem->Extract(h_mst), + "MST Data Extraction Failed", __FILE__, __LINE__); + + graph_o->edge_values = (int*)&h_mst[0]; // output: 0|1 mask for all edges + + if (problem) { delete problem; } + + cudaDeviceSynchronize(); +} + +/** + * @brief dispatch function to handle data types + * + * @param[out] graph_o GRGraph type output graph + * @param[in] graph_i GRGraph type input graph + * @param[in] config MST-specific configurations + * @param[in] data_t Data type configurations + * @param[in] context Modern GPU context 
parameter + */ +void dispatch_mst( + GRGraph *graph_o, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t, + CudaContext &context) { + switch (data_t.VTXID_TYPE) { + case VTXID_INT: { + switch (data_t.SIZET_TYPE) { + case SIZET_INT: { + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = + // create a CSR formatted graph + Csr csr(false); + csr.nodes = graph_i->num_nodes; + csr.edges = graph_i->num_edges; + csr.row_offsets = (int*)graph_i->row_offsets; + csr.column_indices = (int*)graph_i->col_indices; + csr.edge_values = (int*)graph_i->edge_values; + + // configurations if necessary + int num_gpus = 1; // number of GPU(s) to use + int max_grid = 0; // leave it up to the enactor + run_mst( + graph_o, csr, max_grid, num_gpus, context); + + // reset for free memory + csr.row_offsets = NULL; + csr.column_indices = NULL; + csr.edge_values = NULL; + break; + } + case VALUE_UINT: { // template type = + printf("Not Yet Support This DataType Combination.\n"); + break; + } + case VALUE_FLOAT: { // template type = + // create a CSR formatted graph + Csr csr(false); + csr.nodes = graph_i->num_nodes; + csr.edges = graph_i->num_edges; + csr.row_offsets = (int*)graph_i->row_offsets; + csr.column_indices = (int*)graph_i->col_indices; + csr.edge_values = (float*)graph_i->edge_values; + + // configurations if necessary + int num_gpus = 1; // number of GPU(s) to use + int max_grid = 0; // leave it up to the enactor + run_mst( + graph_o, csr, max_grid, num_gpus, context); + + // reset for free memory + csr.row_offsets = NULL; + csr.column_indices = NULL; + csr.edge_values = NULL; + break; + } + } + break; + } + } + break; + } + } +} + +/** + * @brief run_mst entry + * + * @tparam VertexId + * @tparam Value + * @tparam SizeT + * + * @param[out] graph_o GRGraph type output graph + * @param[in] graph_i GRGraph type input graph + * @param[in] config Primitive-specific configurations + * @param[in] data_t Data type configurations + */ +void 
gunrock_mst( + GRGraph *graph_o, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t) { + unsigned int device = 0; + device = config.device; + ContextPtr context = mgpu::CreateCudaDevice(device); + dispatch_mst(graph_o, graph_i, config, data_t, *context); +} + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh index 7d7d7a0a1..fa8dac343 100644 --- a/gunrock/app/mst/mst_enactor.cuh +++ b/gunrock/app/mst/mst_enactor.cuh @@ -215,10 +215,8 @@ public: typedef SuRmFunctor SuRmFunctor; typedef EIdxFunctor EIdxFunctor; typedef MarkFunctor MarkFunctor; - //typedef OrFunctor OrFunctor; cudaError_t retval = cudaSuccess; - unsigned int *d_scanned_edges = NULL; do @@ -297,9 +295,9 @@ public: problem->data_slices[0]->d_keys_array, problem->data_slices[0]->d_edge_weights, graph_slice->edges, - std::numeric_limits::max(), - mgpu::minimum(), - mgpu::equal_to(), + std::numeric_limits::max(), + mgpu::minimum(), + mgpu::equal_to(), problem->data_slices[0]->d_reduced_keys, problem->data_slices[0]->d_reduced_vals, &num_segments, (int*)0, context); @@ -341,8 +339,8 @@ public: util::MemsetKernel<<<128, 128>>>(problem->data_slices[0]->d_successors, std::numeric_limits::max(), graph_slice->nodes); util::MemsetKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - std::numeric_limits::max(), graph_slice->nodes); + problem->data_slices[0]->d_temp_index, + std::numeric_limits::max(), graph_slice->nodes); util::MemsetIdxKernel<<<128, 128>>>( graph_slice->frontier_queues.d_keys[frontier_attribute.selector], graph_slice->nodes); @@ -680,41 +678,41 @@ public: //////////////////////////////////////////////////////////////////////// // filter to remove all -1 in d_col_indices util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_index, 
problem->data_slices[0]->d_col_indices, graph_slice->edges); util::CUBSelect( - problem->data_slices[0]->d_temp_storage, graph_slice->edges, + problem->data_slices[0]->d_temp_index, graph_slice->edges, problem->data_slices[0]->d_col_indices, num_selected); //////////////////////////////////////////////////////////////////////// // filter to remove all -1 in d_edge_weights util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_value, problem->data_slices[0]->d_edge_weights, graph_slice->edges); - util::CUBSelect( - problem->data_slices[0]->d_temp_storage, graph_slice->edges, + util::CUBSelect( + problem->data_slices[0]->d_temp_value, graph_slice->edges, problem->data_slices[0]->d_edge_weights, num_selected); //////////////////////////////////////////////////////////////////////// // filter to remove all -1 in d_keys_array util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_index, problem->data_slices[0]->d_keys_array, graph_slice->edges); util::CUBSelect( - problem->data_slices[0]->d_temp_storage, graph_slice->edges, + problem->data_slices[0]->d_temp_index, graph_slice->edges, problem->data_slices[0]->d_keys_array, num_selected); //////////////////////////////////////////////////////////////////////// // filter to remove all -1 in d_origin_edges util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_index, problem->data_slices[0]->d_origin_edges, graph_slice->edges); util::CUBSelect( - problem->data_slices[0]->d_temp_storage, graph_slice->edges, + problem->data_slices[0]->d_temp_index, graph_slice->edges, problem->data_slices[0]->d_origin_edges, num_selected); if (DEBUG) printf(" * finished remove edges in one super-vertex.\n"); @@ -785,12 +783,12 @@ public: //////////////////////////////////////////////////////////////////////// // bring edges, weights, origin_eids 
together according to keys util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_index, problem->data_slices[0]->d_keys_array, graph_slice->edges); util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_tmp_storage, + problem->data_slices[0]->d_super_edges, // used as temp_index problem->data_slices[0]->d_keys_array, graph_slice->edges); @@ -801,276 +799,15 @@ public: util::CUBRadixSort( true, graph_slice->edges, - problem->data_slices[0]->d_temp_storage, + problem->data_slices[0]->d_temp_index, problem->data_slices[0]->d_edge_weights); util::CUBRadixSort( true, graph_slice->edges, - problem->data_slices[0]->d_tmp_storage, + problem->data_slices[0]->d_super_edges, // used as temp_index problem->data_slices[0]->d_origin_edges); if (DEBUG) printf(" * finished sort according to new vertex ids.\n"); - - /* - //////////////////////////////////////////////////////////////////////// - // remove duplicated edges between super-vertices (optional operation) - if (false)//(enactor_stats.iteration == 0) - { - ////////////////////////////////////////////////////////////////////// - // generate edge flag array based on source vertices list [1] - // using MarkSegmentFromKeys on d_keys_array - util::MemsetKernel<<<128, 128>>>( - problem->data_slices[0]->d_flags_array, 0, graph_slice->edges); - util::MarkSegmentFromKeys<<<128, 128>>>( - problem->data_slices[0]->d_flags_array, - problem->data_slices[0]->d_keys_array, - graph_slice->edges); - - if (debug_info) - { - printf(":: mark segment to generate edge flag array [1] ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_flags_array, graph_slice->edges); - } - - ////////////////////////////////////////////////////////////////////// - // generate edge flag array based on destination vertices list [2] - // create a flags array on the output of segmented sort based on the - // difference in u-v pair using MarkSegmentsFromKeys kernel 
function - util::MarkSegmentFromKeys<<<128, 128>>>( - problem->data_slices[0]->d_edge_flags, - problem->data_slices[0]->d_col_indices, - graph_slice->edges); - - if (debug_info) - { - printf(":: mark segment to generate edge flag array [2] ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_edge_flags, graph_slice->edges); - } - - ////////////////////////////////////////////////////////////////////// - // do or operation for d_edge_flags and d_flags_array - u-v pair - frontier_attribute.queue_index = 0; - frontier_attribute.selector = 0; - frontier_attribute.queue_length = graph_slice->edges; - frontier_attribute.queue_reset = true; - - gunrock::oprtr::filter::Kernel - - <<>>( - enactor_stats.iteration + 1, - frontier_attribute.queue_reset, - frontier_attribute.queue_index, - enactor_stats.num_gpus, - frontier_attribute.queue_length, - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector], - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector^1], - data_slice, - NULL, - work_progress, - graph_slice->frontier_elements[frontier_attribute.selector], - graph_slice->frontier_elements[frontier_attribute.selector^1], - enactor_stats.filter_kernel_stats); - - if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(), - "filter::Kernel failed", __FILE__, __LINE__))) break; - - if (DEBUG) printf(" * finished edge flags - second edge removal.\n"); - - if (debug_info) - { - printf(":: duplicated edges between super-vertex d_edge_flags ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_edge_flags, graph_slice->edges); - printf(":: edge removal u list (d_keys_array) ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_keys_array, graph_slice->edges); - printf(":: edge removal v list (d_col_indices) ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_col_indices, graph_slice->edges); - printf(":: edge removal w list (d_edge_weights) ::"); - util::DisplayDeviceResults( - 
problem->data_slices[0]->d_edge_weights, graph_slice->edges); - } - - ////////////////////////////////////////////////////////////////////// - // scan edge_flags to get edge_keys used for sorting - Scan( - (int*)problem->data_slices[0]->d_edge_flags, graph_slice->edges, - (int)0, mgpu::plus(), (int*)0, (int*)0, - (int*)problem->data_slices[0]->d_temp_storage, context); - - // set first bit of edge_flags back to 1 - util::MemsetKernel<<<1, 1>>>( - problem->data_slices[0]->d_edge_flags, 1, 1); - - ////////////////////////////////////////////////////////////////////// - // calculate the number of segments for edge_offsets - num_segments = Reduce( - problem->data_slices[0]->d_edge_flags, graph_slice->edges, context); - - ////////////////////////////////////////////////////////////////////// - // generate edge_offsets used for SegSortFromIndices - // edge_flags stored in d_row_offsets - frontier_attribute.queue_index = 0; - frontier_attribute.selector = 0; - frontier_attribute.queue_length = graph_slice->edges; - frontier_attribute.queue_reset = true; - - gunrock::oprtr::filter::Kernel - - <<>>( - enactor_stats.iteration + 1, - frontier_attribute.queue_reset, - frontier_attribute.queue_index, - enactor_stats.num_gpus, - frontier_attribute.queue_length, - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector], - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector^1], - data_slice, - NULL, - work_progress, - graph_slice->frontier_elements[frontier_attribute.selector], - graph_slice->frontier_elements[frontier_attribute.selector^1], - enactor_stats.filter_kernel_stats); - - if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(), - "filter::Kernel failed", __FILE__, __LINE__))) break; - - ////////////////////////////////////////////////////////////////////// - // segmented sort d_col_indices, d_edge_weights and d_origin_edges - // copy d_edge_weights to d_temp_storage to use for segmented sort - 
util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_edge_weights, - graph_slice->edges); - - util::SegSortFromIndices( - context, - num_segments, - problem->data_slices[0]->d_row_offsets, - graph_slice->edges, - problem->data_slices[0]->d_edge_weights, - problem->data_slices[0]->d_col_indices); - - util::SegSortFromIndices( - context, - num_segments, - problem->data_slices[0]->d_row_offsets, - graph_slice->edges, - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_origin_edges); - - if (DEBUG) printf(" * finished segmentedSort for edge reduction.\n"); - - if (debug_info) - { - printf(":: second reduction segmented sort d_col_indices ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_col_indices, graph_slice->edges); - printf(":: second reduction segmented sort d_edge_weights ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_edge_weights, graph_slice->edges); - printf(":: second reduction segmented sort d_origin_edges ::"); - util::DisplayDeviceResults( - problem->data_slices[0]->d_origin_edges, graph_slice->edges); - } - - ////////////////////////////////////////////////////////////////////// - // mark -1 to edges that needed to be removed using advance kernel - frontier_attribute.queue_index = 0; - frontier_attribute.selector = 0; - frontier_attribute.queue_length = graph_slice->edges; - frontier_attribute.queue_reset = true; - - gunrock::oprtr::filter::Kernel - - <<>>( - enactor_stats.iteration + 1, - frontier_attribute.queue_reset, - frontier_attribute.queue_index, - enactor_stats.num_gpus, - frontier_attribute.queue_length, - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector], - NULL, - graph_slice->frontier_queues.d_values[frontier_attribute.selector^1], - data_slice, - NULL, - work_progress, - graph_slice->frontier_elements[frontier_attribute.selector], - graph_slice->frontier_elements[frontier_attribute.selector^1], - 
enactor_stats.filter_kernel_stats); - - if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(), - "filter::Kernel failed", __FILE__, __LINE__))) break; - - if (DEBUG) printf(" * finished mark -1 for duplicated edges.\n"); - - ////////////////////////////////////////////////////////////////////// - // filter to remove all -1 in d_col_indices - util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_col_indices, - graph_slice->edges); - util::CUBSelect( - problem->data_slices[0]->d_temp_storage, - graph_slice->edges, - problem->data_slices[0]->d_col_indices, - num_selected); - - ////////////////////////////////////////////////////////////////////// - // filter to remove all -1 in d_edge_weights - util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_edge_weights, - graph_slice->edges); - util::CUBSelect( - problem->data_slices[0]->d_temp_storage, - graph_slice->edges, - problem->data_slices[0]->d_edge_weights, - num_selected); - - ////////////////////////////////////////////////////////////////////// - // filter to remove all -1 in d_keys_array - util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_keys_array, - graph_slice->edges); - util::CUBSelect( - problem->data_slices[0]->d_temp_storage, - graph_slice->edges, - problem->data_slices[0]->d_keys_array, - num_selected); - - ////////////////////////////////////////////////////////////////////// - // filter to remove all -1 in d_origin_edges - util::MemsetCopyVectorKernel<<<128, 128>>>( - problem->data_slices[0]->d_temp_storage, - problem->data_slices[0]->d_origin_edges, - graph_slice->edges); - util::CUBSelect( - problem->data_slices[0]->d_temp_storage, - graph_slice->edges, - problem->data_slices[0]->d_origin_edges, - num_selected); - - if (DEBUG) - printf(" * finished remove edges between super-vertices.\n"); - - 
graph_slice->edges = *num_selected; - - if (DEBUG) - printf(" * finished update #edges: %d [2]\n", graph_slice->edges); - - } // end of removing duplicated edges between super-vertices - */ - if (DEBUG) printf(" (d). Constructing the Vertex List.\n"); //////////////////////////////////////////////////////////////////////// diff --git a/gunrock/app/mst/mst_functor.cuh b/gunrock/app/mst/mst_functor.cuh index 8cf90fe0e..7f38e5d31 100644 --- a/gunrock/app/mst/mst_functor.cuh +++ b/gunrock/app/mst/mst_functor.cuh @@ -111,7 +111,8 @@ struct EdgeFunctor VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0) { - return problem->d_successors[s_id] == d_id; + return problem->d_successors[s_id] == d_id && + problem->d_reduced_vals[s_id] == problem->d_edge_weights[e_id]; } /** @@ -128,7 +129,7 @@ struct EdgeFunctor VertexId e_id = 0, VertexId e_id_in = 0) { util::io::ModifiedStore::St( - problem->d_origin_edges[e_id], problem->d_temp_storage + s_id); + problem->d_origin_edges[e_id], problem->d_temp_index + s_id); } }; @@ -184,7 +185,7 @@ struct MarkFunctor { // mark minimum spanning tree output edges util::io::ModifiedStore::St( - 1, problem->d_mst_output + problem->d_temp_storage[s_id]); + 1, problem->d_mst_output + problem->d_temp_index[s_id]); } }; @@ -246,7 +247,7 @@ struct CyRmFunctor // remove some edges in the MST output result util::io::ModifiedStore::St( - 0, problem->d_mst_output + problem->d_temp_storage[s_id]); + 0, problem->d_mst_output + problem->d_temp_index[s_id]); } }; @@ -363,13 +364,14 @@ struct EgRmFunctor VertexId e_id = 0, VertexId e_id_in = 0) { util::io::ModifiedStore::St( - -1, problem->d_keys_array + e_id); + (VertexId)-1, problem->d_keys_array + e_id); util::io::ModifiedStore::St( - -1, problem->d_col_indices + e_id); + (VertexId)-1, problem->d_col_indices + e_id); + //util::io::ModifiedStore::St( + // (Value)-1, problem->d_edge_weights + e_id); + problem->d_edge_weights[e_id] = (Value) -1; 
util::io::ModifiedStore::St( - -1, problem->d_edge_weights + e_id); - util::io::ModifiedStore::St( - -1, problem->d_origin_edges + e_id); + (VertexId)-1, problem->d_origin_edges + e_id); } /** @@ -505,7 +507,7 @@ struct EIdxFunctor VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0) { util::io::ModifiedStore::St( - node, problem->d_row_offsets + problem->d_temp_storage[node]); + node, problem->d_row_offsets + problem->d_temp_index[node]); } }; @@ -606,13 +608,13 @@ struct SuRmFunctor VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0) { util::io::ModifiedStore::St( - -1, problem->d_keys_array + node); + (VertexId)-1, problem->d_keys_array + node); util::io::ModifiedStore::St( - -1, problem->d_col_indices + node); + (VertexId)-1, problem->d_col_indices + node); util::io::ModifiedStore::St( - -1, problem->d_edge_weights + node); + (Value) -1, problem->d_edge_weights + node); util::io::ModifiedStore::St( - -1, problem->d_origin_edges + node); + (VertexId)-1, problem->d_origin_edges + node); } }; diff --git a/gunrock/app/mst/mst_problem.cuh b/gunrock/app/mst/mst_problem.cuh index b2b8e7f1f..9f4e3db6b 100644 --- a/gunrock/app/mst/mst_problem.cuh +++ b/gunrock/app/mst/mst_problem.cuh @@ -47,7 +47,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> typedef _SizeT SizeT; typedef _Value Value; - static const bool MARK_PREDECESSORS = true; + static const bool MARK_PREDECESSORS = true; static const bool ENABLE_IDEMPOTENCE = false; // helper structures @@ -71,10 +71,10 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> VertexId *d_origin_edges; // origin edge list keep track of e_ids VertexId *d_super_edges; // super edge list for next iteration VertexId *d_col_indices; // column indices of CSR graph (edges) + VertexId *d_temp_index; // used for storing temp index + Value *d_temp_value; // used for storing temp value Value *d_reduced_vals; // store reduced minimum weights Value *d_edge_weights; // store weights 
per edge - Value *d_temp_storage; // used for storing temp arrays - Value *d_tmp_storage; // used for storing temp arrays SizeT *d_supervtx_ids; // super vertex ids scanned from flags SizeT *d_row_offsets; // row offsets of CSR graph }; @@ -107,10 +107,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> * @brief MSTProblem default constructor */ - MSTProblem(): - nodes(0), - edges(0), - num_gpus(0) {} + MSTProblem(): nodes(0), edges(0), num_gpus(0) {} /** * @brief MSTProblem constructor @@ -153,9 +150,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> if (data_slices[i]->d_keys_array) util::GRError(cudaFree(data_slices[i]->d_keys_array), "GpuSlice cudaFree d_keys_array failed", __FILE__, __LINE__); - if (data_slices[i]->d_temp_storage) - util::GRError(cudaFree(data_slices[i]->d_temp_storage), - "GpuSlice cudaFree d_temp_storage failed", __FILE__, __LINE__); + if (data_slices[i]->d_temp_index) + util::GRError(cudaFree(data_slices[i]->d_temp_index), + "GpuSlice cudaFree d_temp_index failed", __FILE__, __LINE__); if (data_slices[i]->d_reduced_keys) util::GRError(cudaFree(data_slices[i]->d_reduced_keys), "GpuSlice cudaFree d_reduced_keys failed", __FILE__, __LINE__); @@ -183,9 +180,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> if (data_slices[i]->d_edge_flags) util::GRError(cudaFree(data_slices[i]->d_edge_flags), "GpuSlice cudaFree d_edge_flags failed", __FILE__, __LINE__); - if (data_slices[i]->d_tmp_storage) - util::GRError(cudaFree(data_slices[i]->d_tmp_storage), - "GpuSlice cudaFree d_tmp_storage failed", __FILE__, __LINE__); + if (data_slices[i]->d_temp_value) + util::GRError(cudaFree(data_slices[i]->d_temp_value), + "GpuSlice cudaFree d_temp_value failed", __FILE__, __LINE__); if (data_slices[i]->d_super_edges) util::GRError(cudaFree(data_slices[i]->d_super_edges), "GpuSlice cudaFree d_super_edges failed", __FILE__, __LINE__); @@ -348,7 +345,7 @@ struct MSTProblem : 
ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> __FILE__, __LINE__)) return retval; data_slices[0]->d_reduced_vals = d_reduced_vals; util::MemsetKernel<<<128, 128>>>( - data_slices[0]->d_reduced_vals, 0, nodes); + data_slices[0]->d_reduced_vals, (Value)0, nodes); unsigned int *d_flags_array; if (retval = util::GRError(cudaMalloc( @@ -370,15 +367,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> util::MemsetKernel<<<128, 128>>>( data_slices[0]->d_keys_array, 0, edges); - SizeT *d_temp_storage; + VertexId *d_temp_index; if (retval = util::GRError(cudaMalloc( - (void**)&d_temp_storage, - edges * sizeof(SizeT)), - "MSTProblem cudaMalloc d_temp_storage Failed", + (void**)&d_temp_index, + edges * sizeof(VertexId)), + "MSTProblem cudaMalloc d_temp_index Failed", __FILE__, __LINE__)) return retval; - data_slices[0]->d_temp_storage = d_temp_storage; + data_slices[0]->d_temp_index = d_temp_index; util::MemsetKernel<<<128, 128>>>( - data_slices[0]->d_temp_storage, 0, edges); + data_slices[0]->d_temp_index, (VertexId)0, edges); VertexId *d_reduced_keys; if (retval = util::GRError(cudaMalloc( @@ -473,15 +470,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> util::MemsetKernel<<<128, 128>>>( data_slices[0]->d_edge_flags, 0, edges); - Value *d_tmp_storage; + Value *d_temp_value; if (retval = util::GRError(cudaMalloc( - (void**)&d_tmp_storage, + (void**)&d_temp_value, edges * sizeof(Value)), - "MSTProblem cudaMalloc d_tmp_storage Failed", + "MSTProblem cudaMalloc d_temp_value Failed", __FILE__, __LINE__)) return retval; - data_slices[0]->d_tmp_storage = d_tmp_storage; + data_slices[0]->d_temp_value = d_temp_value; util::MemsetKernel<<<128, 128>>>( - data_slices[0]->d_tmp_storage, 0, edges); + data_slices[0]->d_temp_value, (Value)0, edges); data_slices[0]->d_labels = NULL; } @@ -576,14 +573,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> data_slices[gpu]->d_keys_array = d_keys_array; } - if 
(!data_slices[gpu]->d_temp_storage) + if (!data_slices[gpu]->d_temp_index) { - SizeT *d_temp_storage; + VertexId *d_temp_index; if (retval = util::GRError(cudaMalloc( - (void**)&d_temp_storage, edges * sizeof(SizeT)), - "MSTProblem cudaMalloc d_temp_storage Failed", + (void**)&d_temp_index, edges * sizeof(VertexId)), + "MSTProblem cudaMalloc d_temp_index Failed", __FILE__, __LINE__)) return retval; - data_slices[gpu]->d_temp_storage = d_temp_storage; + data_slices[gpu]->d_temp_index = d_temp_index; } if (!data_slices[gpu]->d_successors) @@ -685,14 +682,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> data_slices[gpu]->d_edge_flags = d_edge_flags; } - if (!data_slices[gpu]->d_tmp_storage) + if (!data_slices[gpu]->d_temp_value) { - Value *d_tmp_storage; + Value *d_temp_value; if (retval = util::GRError(cudaMalloc( - (void**)&d_tmp_storage, edges * sizeof(Value)), - "MSTProblem cudaMalloc d_tmp_storage Failed", + (void**)&d_temp_value, edges * sizeof(Value)), + "MSTProblem cudaMalloc d_temp_value Failed", __FILE__, __LINE__)) return retval; - data_slices[gpu]->d_tmp_storage = d_tmp_storage; + data_slices[gpu]->d_temp_value = d_temp_value; } data_slices[0]->d_labels = NULL; @@ -727,4 +724,4 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER> // Local Variables: // mode:c++ // c-file-style: "NVIDIA" -// End: \ No newline at end of file +// End: diff --git a/gunrock/app/pr/pr_app.cu b/gunrock/app/pr/pr_app.cu index 8a7200595..2d6d2c376 100644 --- a/gunrock/app/pr/pr_app.cu +++ b/gunrock/app/pr/pr_app.cu @@ -8,21 +8,19 @@ /** * @file pr_app.cu * - * @brief Gunrock PageRank Implementation + * @brief Gunrock PageRank application */ -#include #include -// Graph construction utils +// graph construction utilities #include -// Page Rank includes +// page-rank includes #include #include #include -// Moderngpu include #include using namespace gunrock; @@ -31,168 +29,118 @@ using namespace gunrock::oprtr; using namespace 
gunrock::app::pr; /** - * @brief run page rank + * @brief run page-rank * * @tparam VertexId * @tparam Value * @tparam SizeT * - * @param[out] ggraph_out Pointer to output CSR graph + * @param[out] graph_o Pointer to output CSR graph * @param[out] node_ids Pointer to output node IDs * @param[out] page_rank Pointer to output PageRanks - * @param[in] graph Reference to the CSR graph we process on - * @param[in] source Source ID for personalized PageRank (-1 for general PageRank) - * @param[in] delta Delta value for computing Page Rank, usually set to .85 + * @param[in] csr Reference to the CSR graph we process on + * @param[in] source Source ID for personalized PR (-1 for general PageRank) + * @param[in] delta Delta value for computing PageRank, usually set to 0.85 * @param[in] error Error threshold value * @param[in] max_iter Max iteration for Page Rank computing * @param[in] max_grid_size Maximum CTA occupancy * @param[in] num_gpus Number of GPUs * @param[in] context CudaContext for moderngpu to use */ -template < - typename VertexId, - typename Value, - typename SizeT > -void run_page_rank( - GunrockGraph *ggraph_out, +template + void run_pagerank( + GRGraph *graph_o, VertexId *node_ids, - Value *page_rank, - const Csr &graph, - const VertexId source, + Value *pagerank, + const Csr &csr, const Value delta, const Value error, const SizeT max_iter, const int max_grid_size, const int num_gpus, CudaContext& context) { - typedef PRProblem < - VertexId, - SizeT, - Value > Problem; - - // Allocate host-side label array for gpu-computed results - //Value *h_rank = (Value*)malloc(sizeof(Value) * graph.nodes); - //VertexId *h_node_id = (VertexId*)malloc(sizeof(VertexId) * graph.nodes); - - // Allocate Page Rank enactor map - PREnactor pr_enactor(false); - - // Allocate problem on GPU - Problem *csr_problem = new Problem; - util::GRError(csr_problem->Init( - false, - graph, - num_gpus), - "PageRank Problem Initialization Failed", __FILE__, __LINE__); - - // Perform PageRank 
- GpuTimer gpu_timer; - - util::GRError(csr_problem->Reset( - source, delta, error, pr_enactor.GetFrontierType()), - "PageRank Problem Data Reset Failed", __FILE__, __LINE__); - gpu_timer.Start(); - util::GRError(pr_enactor.template Enact( - context, csr_problem, max_iter, max_grid_size), - "PageRank Problem Enact Failed", __FILE__, __LINE__); - gpu_timer.Stop(); - - float elapsed = gpu_timer.ElapsedMillis(); - - // Copy out results - util::GRError(csr_problem->Extract(page_rank, node_ids), - "PageRank Problem Data Extraction Failed", - __FILE__, __LINE__); - - // Cleanup - if (csr_problem) delete csr_problem; - //if (h_node_id) free(h_node_id); - //if (h_rank) free(h_rank); + typedef PRProblem Problem; + PREnactor enactor(false); // PageRank enactor map + Problem *problem = new Problem; // Allocate problem on GPU + util::GRError(problem->Init(false, csr, num_gpus), + "PR Problem Initialization Failed", __FILE__, __LINE__); + + util::GRError(problem->Reset(0, delta, error, enactor.GetFrontierType()), + "PR Problem Data Reset Failed", __FILE__, __LINE__); + + GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start(); // start + + util::GRError(enactor.template Enact( + context, problem, max_iter, max_grid_size), + "PR Problem Enact Failed", __FILE__, __LINE__); + + gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis(); // elapsed time + printf(" device elapsed time: %.4f ms\n", elapsed); + + util::GRError(problem->Extract(pagerank, node_ids), + "PR Problem Extraction Failed", __FILE__, __LINE__); + + if (problem) delete problem; cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out output of pr problem + * @param[out] graph_o output of pr problem * @param[out] node_ids output of pr problem * @param[out] page_rank output of pr problem - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] pr_config pr specific configurations - * @param[in] data_type data type configurations + * @param[in] graph_i 
GRGraph type input graph + * @param[in] config specific configurations + * @param[in] data_t data type configurations * @param[in] context moderngpu context */ -void dispatch_page_rank( - GunrockGraph *ggraph_out, - void *node_ids, - void *page_rank, - const GunrockGraph *ggraph_in, - const GunrockConfig pr_config, - const GunrockDataType data_type, - CudaContext& context) { - switch (data_type.VTXID_TYPE) { +void dispatch_pagerank( + GRGraph *graph_o, + void *node_ids, + void *pagerank, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t, + CudaContext &context) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_FLOAT: { - // template type = + case VALUE_FLOAT: { // template type = // build input csr format graph Csr csr_graph(false); - csr_graph.nodes = ggraph_in->num_nodes; - csr_graph.edges = ggraph_in->num_edges; - csr_graph.row_offsets = (int*)ggraph_in->row_offsets; - csr_graph.column_indices = (int*)ggraph_in->col_indices; - - // page rank configurations - float delta = 0.85f; //!< default delta value - float error = 0.01f; //!< error threshold - int max_iter = 20; //!< maximum number of iterations - int max_grid_size = 0; //!< 0: leave it up to the enactor - int num_gpus = 1; //!< for multi-gpu enactor to use - int src_node = -1; //!< source node to start - - // determine source vertex to start sssp - switch (pr_config.src_mode) { - case randomize: { - src_node = graphio::RandomNode(csr_graph.nodes); - break; - } - case largest_degree: { - int max_node = 0; - src_node = 
csr_graph.GetNodeWithHighestDegree(max_node); - break; - } - case manually: { - src_node = pr_config.src_node; - break; - } - default: { - src_node = -1; - break; - } - } - delta = pr_config.delta; - error = pr_config.error; - max_iter = pr_config.max_iter; - - run_page_rank( - ggraph_out, + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; + + // pagerank configurations + float delta = 0.85f; // default delta value + float error = 0.01f; // error threshold + int max_iter = 20; // maximum number of iterations + int max_grid_size = 0; // 0: leave it up to the enactor + int num_gpus = 1; // for multi-gpu enactor to use + + delta = config.delta; + error = config.error; + max_iter = config.max_iter; + + run_pagerank( + graph_o, (int*)node_ids, - (float*)page_rank, + (float*)pagerank, csr_graph, - src_node, delta, error, max_iter, @@ -215,37 +163,75 @@ void dispatch_page_rank( } /** - * @brief run_page_rank entry + * @brief run_pr entry * - * @param[out] ggraph_out output of pr problem + * @param[out] graph_o output of pr problem * @param[out] node_ids output of pr problem * @param[out] page_rank output of pr problem - * @param[in] ggraph_in input graph need to process on - * @param[in] pr_config gunrock primitive specific configurations - * @param[in] data_type gunrock datatype struct + * @param[in] graph_i input graph need to process on + * @param[in] config gunrock primitive specific configurations + * @param[in] data_t gunrock data_t struct */ -void gunrock_pr_func( - GunrockGraph *ggraph_out, - void *node_ids, - void *page_rank, - const GunrockGraph *ggraph_in, - const GunrockConfig pr_config, - const GunrockDataType data_type) { - - // moderngpu preparations - int device = 0; - device = pr_config.device; +void gunrock_pagerank( + GRGraph *graph_o, + void *node_ids, + void *pagerank, + const GRGraph *graph_i, + const GRSetup 
config, + const GRTypes data_t) { + unsigned int device = 0; + device = config.device; ContextPtr context = mgpu::CreateCudaDevice(device); + dispatch_pagerank( + graph_o, node_ids, pagerank, graph_i, config, data_t, *context); +} - // luanch dispatch function - dispatch_page_rank( - ggraph_out, - node_ids, - page_rank, - ggraph_in, - pr_config, - data_type, - *context); +/* + * @brief Simple interface take in CSR arrays as input + * @param[out] pagerank Return PageRank scores per node + * @param[in] num_nodes Number of nodes of the input graph + * @param[in] num_edges Number of edges of the input graph + * @param[in] row_offsets CSR-formatted graph input row offsets + * @param[in] col_indices CSR-formatted graph input column indices + * @param[in] source Source to begin traverse + */ +void pagerank( + int* node_ids, + float* pagerank, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices) { + printf("-------------------- setting --------------------\n"); + + struct GRTypes data_t; // primitive-specific data types + data_t.VTXID_TYPE = VTXID_INT; // integer + data_t.SIZET_TYPE = SIZET_INT; // integer + data_t.VALUE_TYPE = VALUE_FLOAT; // float ranks + + struct GRSetup config; // primitive-specific configures + config.device = 0; // setting device to run + config.delta = 0.85f; // default delta value + config.error = 0.01f; // default error threshold + config.max_iter = 20; // maximum number of iterations + + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges); + + printf("-------------------- running --------------------\n"); + gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, 
data_t); + + if (graph_i) free(graph_i); + if (graph_o) free(graph_o); + + printf("------------------- completed -------------------\n"); } // Leave this at the end of the file diff --git a/gunrock/app/pr/pr_enactor.cuh b/gunrock/app/pr/pr_enactor.cuh index 40b0d4ec7..fafe7ff15 100644 --- a/gunrock/app/pr/pr_enactor.cuh +++ b/gunrock/app/pr/pr_enactor.cuh @@ -6,11 +6,11 @@ // --------------------------------------------------------------------------- /** - * @file - * pr_enactor.cuh - * - * @brief PR Problem Enactor - */ +* @file +* pr_enactor.cuh +* +* @brief PR Problem Enactor +*/ #pragma once @@ -32,358 +32,666 @@ namespace gunrock { namespace app { namespace pr { - template class Enactor; +template class Enactor; - template - __global__ void Print_Const ( - const DataSlice* const data_slice) +template +__global__ void Print_Const ( + const DataSlice* const data_slice) +{ + printf("delta = %f, threshold = %f, src_node = %d\n", + data_slice->delta, data_slice->threshold, data_slice->src_node); +} + +template < + typename VertexId, + typename SizeT> +__global__ void Mark_Queue_R0D ( + const SizeT num_elements, + const VertexId* const keys_in, + const SizeT* const degrees, + SizeT* marker) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + VertexId x = blockIdx.x * blockDim.x + threadIdx.x; + + while ( x < num_elements) { - printf("delta = %f, threshold = %f, src_node = %d\n", - data_slice->delta, data_slice->threshold, data_slice->src_node); + VertexId key = keys_in[x]; + //if (degrees[key] == 0) printf("d[%d @ %d]==0 \t", key, x); + marker[x] = degrees[key]==0? 
1 :0; + x += STRIDE; } +} - template < - typename VertexId, - typename SizeT> - __global__ void Mark_Queue_R0D ( - const SizeT num_elements, - const VertexId* const keys_in, - const SizeT* const degrees, - SizeT* marker) - { - const SizeT STRIDE = gridDim.x * blockDim.x; - VertexId x = blockIdx.x * blockDim.x + threadIdx.x; +template < + typename VertexId, + typename SizeT> +__global__ void Make_Queue_R0D ( + const SizeT num_elements, + const VertexId* const keys_in, + const SizeT* const marker, + VertexId* keys_out) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + VertexId x = blockIdx.x * blockDim.x + threadIdx.x; - while ( x < num_elements) + while (x < num_elements) + { + SizeT Mx = marker[x]; + if ((x!=0 && marker[x-1]!=Mx) + ||(x==0 && Mx==1)) { - VertexId key = keys_in[x]; - //if (degrees[key] == 0) printf("d[%d @ %d]==0 \t", key, x); - marker[x] = degrees[key]==0? 1 :0; - x += STRIDE; + keys_out[Mx-1] = keys_in[x]; } + x += STRIDE; } +} - template < - typename VertexId, - typename SizeT> - __global__ void Make_Queue_R0D ( - const SizeT num_elements, - const VertexId* const keys_in, - const SizeT* const marker, - VertexId* keys_out) +template < + typename VertexId, + typename SizeT, + typename Value, + int NUM_VERTEX_ASSOCIATES, + int NUM_VALUE__ASSOCIATES> +__global__ void Expand_Incoming_R0D ( + const SizeT num_elements, + const VertexId* const keys_in, + SizeT* degrees) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + VertexId x = blockIdx.x * blockDim.x + threadIdx.x; + while (x < num_elements) { - const SizeT STRIDE = gridDim.x * blockDim.x; - VertexId x = blockIdx.x * blockDim.x + threadIdx.x; - - while (x < num_elements) - { - SizeT Mx = marker[x]; - if ((x!=0 && marker[x-1]!=Mx) - ||(x==0 && Mx==1)) - { - keys_out[Mx-1] = keys_in[x]; - } - x += STRIDE; - } + VertexId key = keys_in[x]; + degrees[key] = 0; + x += STRIDE; } +} - template < - typename VertexId, - typename SizeT, - typename Value, - int NUM_VERTEX_ASSOCIATES, - int 
NUM_VALUE__ASSOCIATES> - __global__ void Expand_Incoming_R0D ( - const SizeT num_elements, - const VertexId* const keys_in, - SizeT* degrees) +template < + typename VertexId, + typename SizeT> +__global__ void Clear_Zero_R0D ( + const SizeT num_elements, + const SizeT* const degrees_curr, + SizeT* degrees_next) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + VertexId x = blockIdx.x * blockDim.x + threadIdx.x; + while (x < num_elements) { - const SizeT STRIDE = gridDim.x * blockDim.x; - VertexId x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) - { - VertexId key = keys_in[x]; - degrees[key] = 0; - x += STRIDE; - } + if (degrees_curr[x] == 0) + degrees_next[x] = -1; + x += STRIDE; } +} - template < - typename VertexId, - typename SizeT> - __global__ void Clear_Zero_R0D ( - const SizeT num_elements, - const SizeT* const degrees_curr, - SizeT* degrees_next) +template < + typename VertexId, + typename SizeT, + typename Value, + int NUM_VERTEX_ASSOCIATES, + int NUM_VALUE__ASSOCIATES> +__global__ void Expand_Incoming_PR ( + const SizeT num_elements, + const VertexId* const keys_in, + const size_t array_size, + char* array) +{ + extern __shared__ char s_array[]; + const SizeT STRIDE = gridDim.x * blockDim.x; + size_t offset = 0; + offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES; + Value** s_value__associate_in = (Value**)&(s_array[offset]); + offset += sizeof(Value* ) * NUM_VALUE__ASSOCIATES; + offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES; + Value** s_value__associate_org = (Value**)&(s_array[offset]); + SizeT x = threadIdx.x; + while (x < array_size) { - const SizeT STRIDE = gridDim.x * blockDim.x; - VertexId x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) - { - if (degrees_curr[x] == 0) - degrees_next[x] = -1; - x += STRIDE; - } + s_array[x] = array[x]; + x += blockDim.x; } + __syncthreads(); - template < - typename VertexId, - typename SizeT, - typename Value, - int NUM_VERTEX_ASSOCIATES, - int 
NUM_VALUE__ASSOCIATES> - __global__ void Expand_Incoming_PR ( - const SizeT num_elements, - const VertexId* const keys_in, - const size_t array_size, - char* array) + x = blockIdx.x * blockDim.x + threadIdx.x; + while (x < num_elements) { - extern __shared__ char s_array[]; - const SizeT STRIDE = gridDim.x * blockDim.x; - size_t offset = 0; - offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES; - Value** s_value__associate_in = (Value**)&(s_array[offset]); - offset += sizeof(Value* ) * NUM_VALUE__ASSOCIATES; - offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES; - Value** s_value__associate_org = (Value**)&(s_array[offset]); - SizeT x = threadIdx.x; - while (x < array_size) - { - s_array[x] = array[x]; - x += blockDim.x; - } - __syncthreads(); - - x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) - { - VertexId key = keys_in[x]; - Value old_value=atomicAdd(s_value__associate_org[0] + key, s_value__associate_in[0][x]); - if (TO_TRACK) - if (to_track(key)) printf("rank[%d] = %f + %f \n", key, old_value, s_value__associate_in[0][x]); - x+=STRIDE; - } + VertexId key = keys_in[x]; + Value old_value=atomicAdd(s_value__associate_org[0] + key, s_value__associate_in[0][x]); + if (TO_TRACK) + if (to_track(key)) printf("rank[%d] = %f + %f \n", key, old_value, s_value__associate_in[0][x]); + x+=STRIDE; } +} - template < - typename VertexId, - typename SizeT> - __global__ void Assign_Marker_PR( - const SizeT num_elements, - const int num_gpus, - const SizeT* markers, - const int* partition_table, - SizeT** key_markers) +template < + typename VertexId, + typename SizeT> +__global__ void Assign_Marker_PR( + const SizeT num_elements, + const int num_gpus, + const SizeT* markers, + const int* partition_table, + SizeT** key_markers) +{ + extern __shared__ SizeT* s_marker[]; + int gpu = 0; + SizeT x = blockIdx.x * blockDim.x + threadIdx.x; + const SizeT STRIDE = gridDim.x * blockDim.x; + if (threadIdx.x < num_gpus) + s_marker[threadIdx.x] = 
key_markers[threadIdx.x]; + __syncthreads(); + + while (x < num_elements) { - extern __shared__ SizeT* s_marker[]; - int gpu = 0; - SizeT x = blockIdx.x * blockDim.x + threadIdx.x; - const SizeT STRIDE = gridDim.x * blockDim.x; - if (threadIdx.x < num_gpus) - s_marker[threadIdx.x] = key_markers[threadIdx.x]; - __syncthreads(); - - while (x < num_elements) + //gpu = num_gpus; + gpu = partition_table[x]; + if (markers[x] != 1 && gpu != 0) { - //gpu = num_gpus; - gpu = partition_table[x]; - if (markers[x] != 1 && gpu != 0) - { - gpu = num_gpus; - } - for (int i=0; i - __global__ void Assign_Keys_PR ( - const SizeT num_elements, - const int num_gpus, - const int* partition_table, - const SizeT* markers, - SizeT** keys_markers, - VertexId** keys_outs) - { - const SizeT STRIDE = gridDim.x * blockDim.x; - SizeT x = blockIdx.x * blockDim.x + threadIdx.x; +template < + typename VertexId, + typename SizeT> +__global__ void Assign_Keys_PR ( + const SizeT num_elements, + const int num_gpus, + const int* partition_table, + const SizeT* markers, + SizeT** keys_markers, + VertexId** keys_outs) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + SizeT x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) + while (x < num_elements) + { + int gpu = partition_table[x]; + if (markers[x] == 1 || gpu == 0) { - int gpu = partition_table[x]; - if (markers[x] == 1 || gpu == 0) - { - //if (gpu > 0) - //{ - SizeT pos = keys_markers[gpu][x]-1; - //printf("keys_outs[%d][%d] <- %d \t", gpu, pos, x); - keys_outs[gpu][pos] = x; - //} - } - x+=STRIDE; + //if (gpu > 0) + //{ + SizeT pos = keys_markers[gpu][x]-1; + //printf("keys_outs[%d][%d] <- %d \t", gpu, pos, x); + keys_outs[gpu][pos] = x; + //} } + x+=STRIDE; } +} - template < - typename VertexId, - typename SizeT, - typename Value> - __global__ void Assign_Values_PR ( - const SizeT num_elements, - const VertexId* const keys_out, - const Value* const rank_next, - Value* rank_out) - { - const SizeT STRIDE = gridDim.x * 
blockDim.x; - SizeT x = blockIdx.x * blockDim.x + threadIdx.x; +template < + typename VertexId, + typename SizeT, + typename Value> +__global__ void Assign_Values_PR ( + const SizeT num_elements, + const VertexId* const keys_out, + const Value* const rank_next, + Value* rank_out) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + SizeT x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) - { - VertexId key = keys_out[x]; - rank_out[x] = rank_next[key]; - x+=STRIDE; - } + while (x < num_elements) + { + VertexId key = keys_out[x]; + rank_out[x] = rank_next[key]; + x+=STRIDE; } +} - template < - typename VertexId, - typename SizeT, - typename Value> - __global__ void Expand_Incoming_Final ( - const SizeT num_elements, - const VertexId* const keys_in, - const Value* const ranks_in, - Value* ranks_out) +template < + typename VertexId, + typename SizeT, + typename Value> +__global__ void Expand_Incoming_Final ( + const SizeT num_elements, + const VertexId* const keys_in, + const Value* const ranks_in, + Value* ranks_out) +{ + const SizeT STRIDE = gridDim.x * blockDim.x; + SizeT x = blockIdx.x * blockDim.x + threadIdx.x; + while (x < num_elements) { - const SizeT STRIDE = gridDim.x * blockDim.x; - SizeT x = blockIdx.x * blockDim.x + threadIdx.x; - while (x < num_elements) - { - VertexId key = keys_in[x]; - ranks_out[key] = ranks_in[x]; - x+=STRIDE; - } + VertexId key = keys_in[x]; + ranks_out[key] = ranks_in[x]; + x+=STRIDE; } +} template < - typename AdvanceKernelPolicy, - typename FilterKernelPolicy, - typename Enactor> +typename AdvanceKernelPolicy, +typename FilterKernelPolicy, +typename Enactor> struct R0DIteration : public IterationBase < - AdvanceKernelPolicy, FilterKernelPolicy, Enactor, - false, //HAS_SUBQ - true, //HAS_FULLQ - false, //BACKWARD - true, //FORWARD - false> //UPDATE_PREDECESSORS +AdvanceKernelPolicy, FilterKernelPolicy, Enactor, +false, //HAS_SUBQ +true, //HAS_FULLQ +false, //BACKWARD +true, //FORWARD +false> //UPDATE_PREDECESSORS 
{ public: - typedef typename Enactor::SizeT SizeT ; - typedef typename Enactor::Value Value ; - typedef typename Enactor::VertexId VertexId ; - typedef typename Enactor::Problem Problem ; - typedef typename Problem::DataSlice DataSlice ; - typedef GraphSlice GraphSlice; - typedef RemoveZeroDegreeNodeFunctor< - VertexId, - SizeT, - Value, - Problem> RemoveZeroFunctor; - - static void FullQueue_Gather( - int thread_num, - int peer_, - util::DoubleBuffer - *frontier_queue, - util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - DataSlice *data_slice, - DataSlice *d_data_slice, - GraphSlice *graph_slice, - util::CtaWorkProgressLifetime *work_progress, - ContextPtr context, - cudaStream_t stream) +typedef typename Enactor::SizeT SizeT ; +typedef typename Enactor::Value Value ; +typedef typename Enactor::VertexId VertexId ; +typedef typename Enactor::Problem Problem ; +typedef typename Problem::DataSlice DataSlice ; +typedef GraphSlice GraphSlice; +typedef RemoveZeroDegreeNodeFunctor< + VertexId, + SizeT, + Value, + Problem> RemoveZeroFunctor; + +static void FullQueue_Gather( + int thread_num, + int peer_, + util::DoubleBuffer + *frontier_queue, + util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + DataSlice *data_slice, + DataSlice *d_data_slice, + GraphSlice *graph_slice, + util::CtaWorkProgressLifetime *work_progress, + ContextPtr context, + cudaStream_t stream) +{ + if (enactor_stats->iteration == 0) { - if (enactor_stats->iteration == 0) - { - frontier_attribute->queue_reset = true; - frontier_attribute->selector = 0; - frontier_attribute->queue_index = 0; - frontier_attribute->queue_length = data_slice->num_gpus>1 ? data_slice->local_nodes : graph_slice->nodes; - } + frontier_attribute->queue_reset = true; + frontier_attribute->selector = 0; + frontier_attribute->queue_index = 0; + frontier_attribute->queue_length = data_slice->num_gpus>1 ? 
data_slice->local_nodes : graph_slice->nodes; } +} + +static void FullQueue_Core( + int thread_num, + int peer_, + util::DoubleBuffer + *frontier_queue, + util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + DataSlice *data_slice, + DataSlice *d_data_slice, + GraphSlice *graph_slice, + util::CtaWorkProgressLifetime *work_progress, + ContextPtr context, + cudaStream_t stream) +{ + //Print_Const<<<1,1,0,stream>>>(d_data_slice); + SizeT num_valid_node = frontier_attribute->queue_length; + + //util::DisplayDeviceResults(problem->graph_slices[0]->frontier_queues.d_keys[selector], + // num_elements); + //util::cpu_mt::PrintGPUArray("keys0", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); + //util::cpu_mt::PrintGPUArray("degrees0", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); + + //bool over_sized = false; + //if (enactor_stats->retval = Check_Size( + // "scanned_edges", frontier_attribute->queue_length, scanned_edges, over_sized, thread_num, enactor_stats->iteration, peer_)) return; + //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; + frontier_attribute->queue_reset = true; + gunrock::oprtr::advance::LaunchKernel( + enactor_stats[0], + frontier_attribute[0], + d_data_slice, + (VertexId*)NULL, + (bool* )NULL, + (bool* )NULL, + scanned_edges->GetPointer(util::DEVICE), + frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE),// d_in_queue + frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue + (VertexId*)NULL, + (VertexId*)NULL, + graph_slice->row_offsets .GetPointer(util::DEVICE), + graph_slice->column_indices.GetPointer(util::DEVICE), + (SizeT* )NULL, + (VertexId*)NULL, + 
graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector], // max_in_queue + graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1], // max_out_queue + work_progress[0], + context[0], + stream, + gunrock::oprtr::advance::V2V, + false, + false); + + //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), + // "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break; + enactor_stats -> Accumulate( + work_progress -> GetQueueLengthPointer(frontier_attribute->queue_index+1), stream); + + gunrock::oprtr::filter::Kernel + <<filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>( + enactor_stats->iteration, + frontier_attribute->queue_reset, + frontier_attribute->queue_index, + frontier_attribute->queue_length, + frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue + NULL, + frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue + d_data_slice, + NULL, + work_progress[0], + frontier_queue->keys[frontier_attribute->selector ].GetSize(), // max_in_queue + frontier_queue->keys[frontier_attribute->selector^1].GetSize(), // max_out_queue + enactor_stats->filter_kernel_stats); + + //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), + // "filter::Kernel RemoveZeroFunctor failed", __FILE__, __LINE__))) + // break; + + Clear_Zero_R0D + <<<128, 128, 0, stream>>> ( + graph_slice->nodes, + data_slice -> degrees.GetPointer(util::DEVICE), + data_slice -> degrees_pong.GetPointer(util::DEVICE)); + + util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>( + data_slice->degrees.GetPointer(util::DEVICE), + data_slice->degrees_pong.GetPointer(util::DEVICE), graph_slice->nodes); + + //util::DisplayDeviceResults(problem->data_slices[0]->d_degrees, + // graph_slice->nodes); + + frontier_attribute->queue_index++; + frontier_attribute->selector^=1; + if (enactor_stats->retval = 
work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; + if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return; + //enactor_stats->total_queued[0] += frontier_attribute->queue_length; + //util::cpu_mt::PrintGPUArray("keys1", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); + //util::cpu_mt::PrintGPUArray("degrees1", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); + + if (num_valid_node == frontier_attribute->queue_length || num_valid_node==0) data_slice->to_continue = false; + else data_slice->to_continue = true; +} + +static cudaError_t Compute_OutputLength( + FrontierAttribute *frontier_attribute, + SizeT *d_offsets, + VertexId *d_indices, + VertexId *d_in_key_queue, + util::Array1D *partitioned_scanned_edges, + SizeT max_in, + SizeT max_out, + CudaContext &context, + cudaStream_t stream, + gunrock::oprtr::advance::TYPE ADVANCE_TYPE, + bool express = false) +{ + cudaError_t retval = cudaSuccess; + bool over_sized = false; + if (retval = Check_Size ( + "scanned_edges", frontier_attribute->queue_length, partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval; + retval = gunrock::oprtr::advance::ComputeOutputLength + ( + frontier_attribute, + d_offsets, + d_indices, + d_in_key_queue, + partitioned_scanned_edges->GetPointer(util::DEVICE), + max_in, + max_out, + context, + stream, + ADVANCE_TYPE, + express); + return retval; +} + +template +static void Expand_Incoming( + int grid_size, + int block_size, + size_t shared_size, + cudaStream_t stream, + SizeT &num_elements, + const VertexId* const keys_in, + util::Array1D* keys_out, + const size_t array_size, + char* array, + DataSlice* data_slice) +{ + Expand_Incoming_R0D + + 
<<>> ( + num_elements, + keys_in, + data_slice->degrees.GetPointer(util::DEVICE)); + num_elements = 0; +} + +static bool Stop_Condition( + EnactorStats *enactor_stats, + FrontierAttribute *frontier_attribute, + util::Array1D *data_slice, + int num_gpus) +{ + //printf("CC Stop checked\n");fflush(stdout); + for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++) + if (enactor_stats[gpu].retval != cudaSuccess) + { + printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout); + return true; + } + + /*for (int gpu = 0; gpu< num_gpus*num_gpus; gpu++) + if (enactor_stats[gpu].iteration == 0) + { + printf("enactor_stats[%d].iteration ==0\n", gpu);fflush(stdout); + return false; + }*/ + + for (int gpu=0; gputo_continue && frontier_attribute[gpu*num_gpus].queue_length !=0) + { + //printf("data_slice[%d]->to_continue, frontier_attribute[%d].queue_length = %d\n", gpu, gpu*num_gpus, frontier_attribute[gpu*num_gpus].queue_length);fflush(stdout); + return false; + } - static void FullQueue_Core( - int thread_num, - int peer_, - util::DoubleBuffer - *frontier_queue, - util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - DataSlice *data_slice, - DataSlice *d_data_slice, - GraphSlice *graph_slice, - util::CtaWorkProgressLifetime *work_progress, - ContextPtr context, - cudaStream_t stream) + for (int gpu=0; gpuin_length[i][peer]!=0) + { + //printf("data_slice[%d]->in_length[%d][%d] = %d\n", gpu, i, peer, data_slice[gpu]->in_length[i][peer]);fflush(stdout); + return false; + } + + for (int gpu=0; gpuout_length[peer]!=0) + { + //printf("data_slice[%d]->out_length[%d] = %d\n", gpu, peer, data_slice[gpu]->out_length[peer]); fflush(stdout); + return false; + } + //printf("CC to stop\n");fflush(stdout); + return true; +} + +template < + int NUM_VERTEX_ASSOCIATES, + int NUM_VALUE__ASSOCIATES> +static void Make_Output( + int thread_num, + SizeT num_elements, + int 
num_gpus, + util::DoubleBuffer + *frontier_queue, + util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + util::Array1D + *data_slice, + GraphSlice *graph_slice, + util::CtaWorkProgressLifetime *work_progress, + ContextPtr context, + cudaStream_t stream) +{ + if (num_elements == 0) { - //Print_Const<<<1,1,0,stream>>>(d_data_slice); - SizeT num_valid_node = frontier_attribute->queue_length; - - //util::DisplayDeviceResults(problem->graph_slices[0]->frontier_queues.d_keys[selector], - // num_elements); - //util::cpu_mt::PrintGPUArray("keys0", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); - //util::cpu_mt::PrintGPUArray("degrees0", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); - - //bool over_sized = false; - //if (enactor_stats->retval = Check_Size( - // "scanned_edges", frontier_attribute->queue_length, scanned_edges, over_sized, thread_num, enactor_stats->iteration, peer_)) return; - //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; - frontier_attribute->queue_reset = true; - gunrock::oprtr::advance::LaunchKernel( - enactor_stats[0], - frontier_attribute[0], - d_data_slice, - (VertexId*)NULL, - (bool* )NULL, - (bool* )NULL, - scanned_edges->GetPointer(util::DEVICE), - frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE),// d_in_queue - frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue - (VertexId*)NULL, - (VertexId*)NULL, - graph_slice->row_offsets .GetPointer(util::DEVICE), - graph_slice->column_indices.GetPointer(util::DEVICE), - (SizeT* )NULL, - (VertexId*)NULL, - graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector], // 
max_in_queue - graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1], // max_out_queue - work_progress[0], - context[0], - stream, - gunrock::oprtr::advance::V2V, - false, - false); + for (int peer_ =0; peer_out_length[peer_] = 0; + return; + } + + int block_size = 256; + int grid_size = num_elements / block_size; + int peer_ = 0; + if ((num_elements % block_size)!=0) grid_size ++; + if (grid_size > 512) grid_size = 512; + + //util::MemsetKernel<<<128, 128, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), 0, num_elements); + Mark_Queue_R0D + <<>> ( + num_elements, + frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), + data_slice[0] -> degrees.GetPointer(util::DEVICE), + data_slice[0] -> markers.GetPointer(util::DEVICE)); + //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), num_elements, thread_num, enactor_stats->iteration, -1, stream); + + Scan( + (int*)data_slice[0] -> markers.GetPointer(util::DEVICE), + num_elements, + (int)0, mgpu::plus(), (int*)0, (int*)0, + (int*)data_slice[0] -> markers.GetPointer(util::DEVICE), + context[0]); + + Make_Queue_R0D + <<>> ( + num_elements, + frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), + data_slice[0]->markers.GetPointer(util::DEVICE), + data_slice[0]->keys_out[1].GetPointer(util::DEVICE)); + + if (!Enactor::SIZE_CHECK) + util::MemsetCopyVectorKernel <<>>( + data_slice[0]->frontier_queues[0].keys[frontier_attribute->selector].GetPointer(util::DEVICE), + frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), + num_elements); + + cudaMemcpyAsync(&data_slice[0]->out_length[1], data_slice[0]->markers.GetPointer(util::DEVICE) + num_elements -1, sizeof(SizeT), cudaMemcpyDeviceToHost, stream); + //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout); + if (enactor_stats->retval = 
util::GRError(cudaStreamSynchronize(stream), "cudaStramSynchronize failed", __FILE__, __LINE__)) return; + //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout); + for (peer_ = 2; peer_ < num_gpus; peer_++) + data_slice[0]->out_length[peer_] = data_slice[0]->out_length[1]; + data_slice[0]->out_length[0] = frontier_attribute->queue_length; +} + +/*static void Check_Queue_Size( + int thread_num, + int peer_, + SizeT request_length, + util::DoubleBuffer + *frontier_queue, + //util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + //DataSlice *data_slice, + //DataSlice *d_data_slice, + GraphSlice *graph_slice + //util::CtaWorkProgressLifetime *work_progress, + //ContextPtr context, + //cudaStream_t stream + ) +{ + bool over_sized = false; + int selector = frontier_attribute->selector; + int iteration = enactor_stats -> iteration; + + if (Enactor::DEBUG) + printf("%d\t %d\t %d\t queue_length = %d, output_length = %d\n", + thread_num, iteration, peer_, + frontier_queue->keys[selector^1].GetSize(), + request_length);fflush(stdout); + + if (enactor_stats->retval = + Check_Size ("queue3", request_length, &frontier_queue->keys [selector^1], over_sized, thread_num, iteration, peer_, false)) return; + if (enactor_stats->retval = + Check_Size ("queue3", graph_slice->nodes+2, &frontier_queue->keys [selector ], over_sized, thread_num, iteration, peer_, true )) return; + if (Problem::USE_DOUBLE_BUFFER) + { + if (enactor_stats->retval = + Check_Size ("queue3", request_length, &frontier_queue->values[selector^1], over_sized, thread_num, iteration, peer_, false)) return; + if (enactor_stats->retval = + Check_Size ("queue3", graph_slice->nodes+2, &frontier_queue->values[selector ], over_sized, thread_num, iteration, peer_, true )) return; + } +} */ - //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), - // "edge_map_forward::Kernel failed", 
__FILE__, __LINE__))) break; - enactor_stats -> Accumulate( - work_progress -> GetQueueLengthPointer(frontier_attribute->queue_index+1), stream); +}; // end R0DIteration - gunrock::oprtr::filter::Kernel - <<filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>( +template < +typename AdvanceKernelPolicy, +typename FilterKernelPolicy, +typename Enactor> +struct PRIteration : public IterationBase < +AdvanceKernelPolicy, FilterKernelPolicy, Enactor, +false, //HAS_SUBQ +true, //HAS_FULLQ +false, //BACKWARD +true, //FORWARD +false> //UPDATE_PREDECESSORS +{ +public: +typedef typename Enactor::SizeT SizeT ; +typedef typename Enactor::Value Value ; +typedef typename Enactor::VertexId VertexId ; +typedef typename Enactor::Problem Problem ; +typedef typename Problem::DataSlice DataSlice ; +typedef GraphSlice GraphSlice; +typedef PRFunctor PrFunctor; +typedef PRMarkerFunctor PrMarkerFunctor; + +static void FullQueue_Core( + int thread_num, + int peer_, + util::DoubleBuffer + *frontier_queue, + util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + DataSlice *data_slice, + DataSlice *d_data_slice, + GraphSlice *graph_slice, + util::CtaWorkProgressLifetime *work_progress, + ContextPtr context, + cudaStream_t stream) +{ + //Print_Const<<<1,1,0,stream>>>(d_data_slice); + //for (int i=0; i<3; i++) + //{ + //if (enactor_stats -> iteration != 0 || i!=0) + if (enactor_stats -> iteration != 0) + { + frontier_attribute->queue_length = data_slice -> edge_map_queue_len; + enactor_stats->total_queued[0] += frontier_attribute->queue_length; + + //printf("Filter start.\n");fflush(stdout); + // filter kernel + gunrock::oprtr::filter::Kernel + <<filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>( enactor_stats->iteration, frontier_attribute->queue_reset, frontier_attribute->queue_index, frontier_attribute->queue_length, frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue NULL, - 
frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue + NULL,//frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue d_data_slice, NULL, work_progress[0], @@ -391,54 +699,133 @@ public: frontier_queue->keys[frontier_attribute->selector^1].GetSize(), // max_out_queue enactor_stats->filter_kernel_stats); - //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), - // "filter::Kernel RemoveZeroFunctor failed", __FILE__, __LINE__))) - // break; + //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break; + //cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates - Clear_Zero_R0D - <<<128, 128, 0, stream>>> ( - graph_slice->nodes, - data_slice -> degrees.GetPointer(util::DEVICE), - data_slice -> degrees_pong.GetPointer(util::DEVICE)); + //printf("Filter end.\n");fflush(stdout); + //enactor_stats->iteration++; + frontier_attribute->queue_index++; + + if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; + //num_elements = queue_length; + //swap rank_curr and rank_next util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>( - data_slice->degrees.GetPointer(util::DEVICE), - data_slice->degrees_pong.GetPointer(util::DEVICE), graph_slice->nodes); + data_slice->rank_curr.GetPointer(util::DEVICE), + data_slice->rank_next.GetPointer(util::DEVICE), + graph_slice->nodes); + util::MemsetKernel<<<128, 128, 0, stream>>>( + data_slice->rank_next.GetPointer(util::DEVICE), + (Value)0.0, graph_slice->nodes); + + if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)); + data_slice->PR_queue_length = frontier_attribute->queue_length; + //enactor_stats -> Accumulate( + // work_progress -> GetQueueLengthPointer(frontier_attribute->queue_index), 
stream); + //printf("queue_length = %d\n", frontier_attribute->queue_length);fflush(stdout); + if (false) {//if (INSTRUMENT || DEBUG) { + //if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length,false,stream)) return; + //enactor_stats->total_queued += frontier_attribute->queue_length; + //if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length); + if (Enactor::INSTRUMENT) { + if (enactor_stats->retval = enactor_stats->filter_kernel_stats.Accumulate( + enactor_stats->filter_grid_size, + enactor_stats->total_runtimes, + enactor_stats->total_lifetimes, + false, stream)) return; + } + } + } - //util::DisplayDeviceResults(problem->data_slices[0]->d_degrees, - // graph_slice->nodes); + //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return; + frontier_attribute->queue_length = data_slice->edge_map_queue_len; + //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); + //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); + //util::cpu_mt::PrintGPUArray("ranks", data_slice->rank_curr.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); + + //printf("Advance start.\n");fflush(stdout); + // Edge Map + frontier_attribute->queue_reset = true; + gunrock::oprtr::advance::LaunchKernel( + //d_done, + enactor_stats[0], + frontier_attribute[0], + d_data_slice, + (VertexId*)NULL, + (bool* )NULL, + (bool* )NULL, + scanned_edges->GetPointer(util::DEVICE), + frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue + NULL, 
//frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue + (VertexId*)NULL, + (VertexId*)NULL, + graph_slice->row_offsets .GetPointer(util::DEVICE), + graph_slice->column_indices.GetPointer(util::DEVICE), + (SizeT* )NULL, + (VertexId*)NULL, + graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector], // max_in_queue + graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1],// max_out_queue + work_progress[0], + context[0], + stream, + gunrock::oprtr::advance::V2V, + false, + false); + + if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index+1, frontier_attribute->queue_length, false, stream, true)) return; + if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; + enactor_stats->total_queued[0] += frontier_attribute->queue_length; + frontier_attribute->queue_length = data_slice->edge_map_queue_len; + //printf("Advance end.\n");fflush(stdout); - frontier_attribute->queue_index++; - frontier_attribute->selector^=1; - if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; - if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return; - //enactor_stats->total_queued[0] += frontier_attribute->queue_length; - //util::cpu_mt::PrintGPUArray("keys1", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); - //util::cpu_mt::PrintGPUArray("degrees1", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); + //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break; + //cudaEventQuery(throttle_event); // give host memory mapped visibility 
to GPU updates - if (num_valid_node == frontier_attribute->queue_length || num_valid_node==0) data_slice->to_continue = false; - else data_slice->to_continue = true; + /*if (Enactor::DEBUG) { + if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; } - static cudaError_t Compute_OutputLength( - FrontierAttribute *frontier_attribute, - SizeT *d_offsets, - VertexId *d_indices, - VertexId *d_in_key_queue, - util::Array1D *partitioned_scanned_edges, - SizeT max_in, - SizeT max_out, - CudaContext &context, - cudaStream_t stream, - gunrock::oprtr::advance::TYPE ADVANCE_TYPE, - bool express = false) + if (Enactor::INSTRUMENT) { + if (enactor_stats->retval = enactor_stats->advance_kernel_stats.Accumulate( + enactor_stats->advance_grid_size, + enactor_stats->total_runtimes, + enactor_stats->total_lifetimes, false, stream)) return; + }*/ + + //if (done[0] == 0) break; + + //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return; + + //if (done[0] == 0 || frontier_attribute.queue_length == 0 || enactor_stats.iteration > max_iteration) break; + + //if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration); + //} +} + +static cudaError_t Compute_OutputLength( + FrontierAttribute *frontier_attribute, + SizeT *d_offsets, + VertexId *d_indices, + VertexId *d_in_key_queue, + util::Array1D *partitioned_scanned_edges, + SizeT max_in, + SizeT max_out, + CudaContext &context, + cudaStream_t stream, + gunrock::oprtr::advance::TYPE ADVANCE_TYPE, + bool express = false) +{ + //printf("Compute_OutputLength start.\n");fflush(stdout); + cudaError_t retval = cudaSuccess; + if (AdvanceKernelPolicy::ADVANCE_MODE == gunrock::oprtr::advance::TWC_FORWARD) { - cudaError_t retval = cudaSuccess; + //return retval; + } else { bool over_sized = false; if (retval = Check_Size ( "scanned_edges", frontier_attribute->queue_length, 
partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval; retval = gunrock::oprtr::advance::ComputeOutputLength - ( + ( frontier_attribute, d_offsets, d_indices, @@ -449,314 +836,114 @@ public: context, stream, ADVANCE_TYPE, - express); - return retval; - } - - template - static void Expand_Incoming( - int grid_size, - int block_size, - size_t shared_size, - cudaStream_t stream, - SizeT &num_elements, - const VertexId* const keys_in, - util::Array1D* keys_out, - const size_t array_size, - char* array, - DataSlice* data_slice) - { - Expand_Incoming_R0D - - <<>> ( - num_elements, - keys_in, - data_slice->degrees.GetPointer(util::DEVICE)); - num_elements = 0; + express); } - - static bool Stop_Condition( - EnactorStats *enactor_stats, - FrontierAttribute *frontier_attribute, - util::Array1D *data_slice, - int num_gpus) + //printf("Compute_OutputLength end.\n");fflush(stdout); + return retval; +} + +template +static void Expand_Incoming( + int grid_size, + int block_size, + size_t shared_size, + cudaStream_t stream, + SizeT &num_elements, + const VertexId* const keys_in, + util::Array1D* keys_out, + const size_t array_size, + char* array, + DataSlice* data_slice) +{ + //util::cpu_mt::PrintCPUArray("Incoming_length", &num_elements, 1, data_slice->gpu_idx); + Expand_Incoming_PR + + <<>> ( + num_elements, + keys_in, + array_size, + array); + num_elements = 0; +} + +static bool Stop_Condition ( + EnactorStats *enactor_stats, + FrontierAttribute *frontier_attribute, + util::Array1D *data_slice, + int num_gpus) +{ + bool all_zero = true; + for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++) + if (enactor_stats[gpu].retval != cudaSuccess) { - //printf("CC Stop checked\n");fflush(stdout); - for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++) - if (enactor_stats[gpu].retval != cudaSuccess) - { - printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout); - return true; - } - - /*for 
(int gpu = 0; gpu< num_gpus*num_gpus; gpu++) - if (enactor_stats[gpu].iteration == 0) - { - printf("enactor_stats[%d].iteration ==0\n", gpu);fflush(stdout); - return false; - }*/ - - for (int gpu=0; gputo_continue && frontier_attribute[gpu*num_gpus].queue_length !=0) - { - //printf("data_slice[%d]->to_continue, frontier_attribute[%d].queue_length = %d\n", gpu, gpu*num_gpus, frontier_attribute[gpu*num_gpus].queue_length);fflush(stdout); - return false; - } - - for (int gpu=0; gpuin_length[i][peer]!=0) - { - //printf("data_slice[%d]->in_length[%d][%d] = %d\n", gpu, i, peer, data_slice[gpu]->in_length[i][peer]);fflush(stdout); - return false; - } - - for (int gpu=0; gpuout_length[peer]!=0) - { - //printf("data_slice[%d]->out_length[%d] = %d\n", gpu, peer, data_slice[gpu]->out_length[peer]); fflush(stdout); - return false; - } - //printf("CC to stop\n");fflush(stdout); + //printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout); return true; - } + } - template < - int NUM_VERTEX_ASSOCIATES, - int NUM_VALUE__ASSOCIATES> - static void Make_Output( - int thread_num, - SizeT num_elements, - int num_gpus, - util::DoubleBuffer - *frontier_queue, - util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - util::Array1D - *data_slice, - GraphSlice *graph_slice, - util::CtaWorkProgressLifetime *work_progress, - ContextPtr context, - cudaStream_t stream) + for (int gpu =0; gpu < num_gpus; gpu++) + if (data_slice[gpu]->PR_queue_length > 0) { - if (num_elements == 0) - { - for (int peer_ =0; peer_out_length[peer_] = 0; - return; - } - - int block_size = 256; - int grid_size = num_elements / block_size; - int peer_ = 0; - if ((num_elements % block_size)!=0) grid_size ++; - if (grid_size > 512) grid_size = 512; - - //util::MemsetKernel<<<128, 128, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), 0, num_elements); - Mark_Queue_R0D - <<>> ( 
- num_elements, - frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), - data_slice[0] -> degrees.GetPointer(util::DEVICE), - data_slice[0] -> markers.GetPointer(util::DEVICE)); - //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), num_elements, thread_num, enactor_stats->iteration, -1, stream); - - Scan( - (int*)data_slice[0] -> markers.GetPointer(util::DEVICE), - num_elements, - (int)0, mgpu::plus(), (int*)0, (int*)0, - (int*)data_slice[0] -> markers.GetPointer(util::DEVICE), - context[0]); - - Make_Queue_R0D - <<>> ( - num_elements, - frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), - data_slice[0]->markers.GetPointer(util::DEVICE), - data_slice[0]->keys_out[1].GetPointer(util::DEVICE)); - - if (!Enactor::SIZE_CHECK) - util::MemsetCopyVectorKernel <<>>( - data_slice[0]->frontier_queues[0].keys[frontier_attribute->selector].GetPointer(util::DEVICE), - frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), - num_elements); - - cudaMemcpyAsync(&data_slice[0]->out_length[1], data_slice[0]->markers.GetPointer(util::DEVICE) + num_elements -1, sizeof(SizeT), cudaMemcpyDeviceToHost, stream); - //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout); - if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStramSynchronize failed", __FILE__, __LINE__)) return; - //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout); - for (peer_ = 2; peer_ < num_gpus; peer_++) - data_slice[0]->out_length[peer_] = data_slice[0]->out_length[1]; - data_slice[0]->out_length[0] = frontier_attribute->queue_length; + //printf("data_slice[%d].PR_queue_length = %d\n", gpu, data_slice[gpu]->PR_queue_length); + all_zero = false; } + if (all_zero) return true; - /*static void Check_Queue_Size( - int 
thread_num, - int peer_, - SizeT request_length, - util::DoubleBuffer - *frontier_queue, - //util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - //DataSlice *data_slice, - //DataSlice *d_data_slice, - GraphSlice *graph_slice - //util::CtaWorkProgressLifetime *work_progress, - //ContextPtr context, - //cudaStream_t stream - ) - { - bool over_sized = false; - int selector = frontier_attribute->selector; - int iteration = enactor_stats -> iteration; - - if (Enactor::DEBUG) - printf("%d\t %d\t %d\t queue_length = %d, output_length = %d\n", - thread_num, iteration, peer_, - frontier_queue->keys[selector^1].GetSize(), - request_length);fflush(stdout); - - if (enactor_stats->retval = - Check_Size ("queue3", request_length, &frontier_queue->keys [selector^1], over_sized, thread_num, iteration, peer_, false)) return; - if (enactor_stats->retval = - Check_Size ("queue3", graph_slice->nodes+2, &frontier_queue->keys [selector ], over_sized, thread_num, iteration, peer_, true )) return; - if (Problem::USE_DOUBLE_BUFFER) - { - if (enactor_stats->retval = - Check_Size ("queue3", request_length, &frontier_queue->values[selector^1], over_sized, thread_num, iteration, peer_, false)) return; - if (enactor_stats->retval = - Check_Size ("queue3", graph_slice->nodes+2, &frontier_queue->values[selector ], over_sized, thread_num, iteration, peer_, true )) return; - } - } */ + for (int gpu =0; gpu < num_gpus; gpu++) + if (enactor_stats[gpu * num_gpus].iteration < data_slice[0]->max_iter) + { + //printf("enactor_stats[%d].iteration = %lld\n", gpu, enactor_stats[gpu * num_gpus].iteration); + return false; + } -}; // end R0DIteration + return true; +} template < - typename AdvanceKernelPolicy, - typename FilterKernelPolicy, - typename Enactor> -struct PRIteration : public IterationBase < - AdvanceKernelPolicy, FilterKernelPolicy, Enactor, - false, //HAS_SUBQ - true, //HAS_FULLQ - false, //BACKWARD - true, //FORWARD - false> 
//UPDATE_PREDECESSORS + int NUM_VERTEX_ASSOCIATES, + int NUM_VALUE__ASSOCIATES> +static void Make_Output( + int thread_num, + SizeT num_elements, + int num_gpus, + util::DoubleBuffer + *frontier_queue, + util::Array1D *scanned_edges, + FrontierAttribute *frontier_attribute, + EnactorStats *enactor_stats, + util::Array1D + *data_slice, + GraphSlice *graph_slice, + util::CtaWorkProgressLifetime *work_progress, + ContextPtr context, + cudaStream_t stream) { -public: - typedef typename Enactor::SizeT SizeT ; - typedef typename Enactor::Value Value ; - typedef typename Enactor::VertexId VertexId ; - typedef typename Enactor::Problem Problem ; - typedef typename Problem::DataSlice DataSlice ; - typedef GraphSlice GraphSlice; - typedef PRFunctor PrFunctor; - typedef PRMarkerFunctor PrMarkerFunctor; - - static void FullQueue_Core( - int thread_num, - int peer_, - util::DoubleBuffer - *frontier_queue, - util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - DataSlice *data_slice, - DataSlice *d_data_slice, - GraphSlice *graph_slice, - util::CtaWorkProgressLifetime *work_progress, - ContextPtr context, - cudaStream_t stream) + //printf("Make_Output entered\n");fflush(stdout); + int peer_ = 0; + int block_size = 512; + int grid_size = graph_slice->nodes / block_size; + if ((graph_slice->nodes % block_size)!=0) grid_size ++; + if (grid_size > 512) grid_size = 512; + + if (num_gpus > 1 && enactor_stats->iteration==0) { - //Print_Const<<<1,1,0,stream>>>(d_data_slice); - //for (int i=0; i<3; i++) - //{ - //if (enactor_stats -> iteration != 0 || i!=0) - if (enactor_stats -> iteration != 0) - { - frontier_attribute->queue_length = data_slice -> edge_map_queue_len; - enactor_stats->total_queued[0] += frontier_attribute->queue_length; - - //printf("Filter start.\n");fflush(stdout); - // filter kernel - gunrock::oprtr::filter::Kernel - <<filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>( - enactor_stats->iteration, - 
frontier_attribute->queue_reset, - frontier_attribute->queue_index, - frontier_attribute->queue_length, - frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue - NULL, - NULL,//frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue - d_data_slice, - NULL, - work_progress[0], - frontier_queue->keys[frontier_attribute->selector ].GetSize(), // max_in_queue - frontier_queue->keys[frontier_attribute->selector^1].GetSize(), // max_out_queue - enactor_stats->filter_kernel_stats); - - //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break; - //cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates - - //printf("Filter end.\n");fflush(stdout); - //enactor_stats->iteration++; - frontier_attribute->queue_index++; - - if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; - //num_elements = queue_length; - - //swap rank_curr and rank_next - util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>( - data_slice->rank_curr.GetPointer(util::DEVICE), - data_slice->rank_next.GetPointer(util::DEVICE), - graph_slice->nodes); - util::MemsetKernel<<<128, 128, 0, stream>>>( - data_slice->rank_next.GetPointer(util::DEVICE), - (Value)0.0, graph_slice->nodes); - - if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)); - data_slice->PR_queue_length = frontier_attribute->queue_length; - //enactor_stats -> Accumulate( - // work_progress -> GetQueueLengthPointer(frontier_attribute->queue_index), stream); - //printf("queue_length = %d\n", frontier_attribute->queue_length);fflush(stdout); - if (false) {//if (INSTRUMENT || DEBUG) { - //if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, 
frontier_attribute->queue_length,false,stream)) return; - //enactor_stats->total_queued += frontier_attribute->queue_length; - //if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length); - if (Enactor::INSTRUMENT) { - if (enactor_stats->retval = enactor_stats->filter_kernel_stats.Accumulate( - enactor_stats->filter_grid_size, - enactor_stats->total_runtimes, - enactor_stats->total_lifetimes, - false, stream)) return; - } - } - } - - //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return; - frontier_attribute->queue_length = data_slice->edge_map_queue_len; - //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream); - //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); - //util::cpu_mt::PrintGPUArray("ranks", data_slice->rank_curr.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream); - - //printf("Advance start.\n");fflush(stdout); + util::MemsetKernel<<>>(data_slice[0]->markers.GetPointer(util::DEVICE), (SizeT)0, graph_slice->nodes); + frontier_attribute->queue_length = data_slice[0]->edge_map_queue_len; + //util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, -1, stream); + //util::cpu_mt::PrintGPUArray("row_offsets", graph_slice->row_offsets.GetPointer(util::DEVICE), graph_slice->nodes+1, thread_num, enactor_stats->iteration, -1, stream); + //printf("Advance start.\n");fflush(stdout); + frontier_attribute->queue_reset = true; // Edge Map - frontier_attribute->queue_reset = true; - 
gunrock::oprtr::advance::LaunchKernel( + gunrock::oprtr::advance::LaunchKernel( //d_done, enactor_stats[0], frontier_attribute[0], - d_data_slice, + data_slice->GetPointer(util::DEVICE), (VertexId*)NULL, (bool* )NULL, (bool* )NULL, scanned_edges->GetPointer(util::DEVICE), frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue - NULL, //frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue + frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue (VertexId*)NULL, (VertexId*)NULL, graph_slice->row_offsets .GetPointer(util::DEVICE), @@ -770,392 +957,205 @@ public: stream, gunrock::oprtr::advance::V2V, false, - false); + true); + //printf("Advance end.\n");fflush(stdout); + //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); - if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index+1, frontier_attribute->queue_length, false, stream, true)) return; - if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; - enactor_stats->total_queued[0] += frontier_attribute->queue_length; - frontier_attribute->queue_length = data_slice->edge_map_queue_len; - //printf("Advance end.\n");fflush(stdout); - - //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break; - //cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates - - /*if (Enactor::DEBUG) { - if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return; - } - - if (Enactor::INSTRUMENT) { - if (enactor_stats->retval = enactor_stats->advance_kernel_stats.Accumulate( - enactor_stats->advance_grid_size, - enactor_stats->total_runtimes, - enactor_stats->total_lifetimes, false, stream)) 
return; - }*/ - - //if (done[0] == 0) break; - - //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return; - - //if (done[0] == 0 || frontier_attribute.queue_length == 0 || enactor_stats.iteration > max_iteration) break; - - //if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration); - //} - } - - static cudaError_t Compute_OutputLength( - FrontierAttribute *frontier_attribute, - SizeT *d_offsets, - VertexId *d_indices, - VertexId *d_in_key_queue, - util::Array1D *partitioned_scanned_edges, - SizeT max_in, - SizeT max_out, - CudaContext &context, - cudaStream_t stream, - gunrock::oprtr::advance::TYPE ADVANCE_TYPE, - bool express = false) - { - //printf("Compute_OutputLength start.\n");fflush(stdout); - cudaError_t retval = cudaSuccess; - if (AdvanceKernelPolicy::ADVANCE_MODE == gunrock::oprtr::advance::TWC_FORWARD) + for (peer_ = 0; peer_>> ( data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), 0, graph_slice->nodes); + Assign_Marker_PR + <<>> ( + graph_slice->nodes, + num_gpus, + data_slice[0]->markers.GetPointer(util::DEVICE), + graph_slice->partition_table.GetPointer(util::DEVICE), + data_slice[0]->keys_markers.GetPointer(util::DEVICE)); + //for (peer_ = 0; peer_keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); + + for (peer_ = 0; peer_( + (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)), + graph_slice->nodes, + (int)0, mgpu::plus(), (int*)0, (int*)0, + (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)), + context[0]); + //for (peer_ = 0; peer_keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); + + SizeT temp_length = data_slice[0]->out_length[0]; + if (graph_slice->nodes > 0) for (peer_ = 0; peer_out_length[peer_], + data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE) + (graph_slice->nodes -1), + sizeof(SizeT), 
cudaMemcpyDeviceToHost, stream); } else { - bool over_sized = false; - if (retval = Check_Size ( - "scanned_edges", frontier_attribute->queue_length, partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval; - retval = gunrock::oprtr::advance::ComputeOutputLength - ( - frontier_attribute, - d_offsets, - d_indices, - d_in_key_queue, - partitioned_scanned_edges->GetPointer(util::DEVICE), - max_in, - max_out, - context, - stream, - ADVANCE_TYPE, - express); - } - //printf("Compute_OutputLength end.\n");fflush(stdout); - return retval; - } - - template - static void Expand_Incoming( - int grid_size, - int block_size, - size_t shared_size, - cudaStream_t stream, - SizeT &num_elements, - const VertexId* const keys_in, - util::Array1D* keys_out, - const size_t array_size, - char* array, - DataSlice* data_slice) - { - //util::cpu_mt::PrintCPUArray("Incoming_length", &num_elements, 1, data_slice->gpu_idx); - Expand_Incoming_PR - - <<>> ( - num_elements, - keys_in, - array_size, - array); - num_elements = 0; - } - - static bool Stop_Condition ( - EnactorStats *enactor_stats, - FrontierAttribute *frontier_attribute, - util::Array1D *data_slice, - int num_gpus) - { - bool all_zero = true; - for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++) - if (enactor_stats[gpu].retval != cudaSuccess) - { - //printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout); - return true; - } - - for (int gpu =0; gpu < num_gpus; gpu++) - if (data_slice[gpu]->PR_queue_length > 0) - { - //printf("data_slice[%d].PR_queue_length = %d\n", gpu, data_slice[gpu]->PR_queue_length); - all_zero = false; + for (peer_ = 1; peer_out_length[peer_] = 0; } - if (all_zero) return true; + if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; - for (int gpu =0; gpu < num_gpus; gpu++) - if (enactor_stats[gpu * num_gpus].iteration < data_slice[0]->max_iter) + for (peer_ = 0; peer_1) { + 
data_slice[0]->keys_out[peer_] = data_slice[0]->temp_keys_out[peer_]; + data_slice[0]->temp_keys_out[peer_] = util::Array1D(); + } + if (enactor_stats->retval = Check_Size ( + "keys_out", data_slice[0]->out_length[peer_], &data_slice[0]->keys_out[peer_], over_sized, thread_num, enactor_stats->iteration, peer_)) return; + if (peer_>0) + if (enactor_stats->retval = Check_Size ( + "values_out", data_slice[0]->out_length[peer_], &data_slice[0]->value__associate_out[peer_][0], over_sized, thread_num, enactor_stats->iteration, peer_)) return; + data_slice[0]->keys_outs[peer_] = data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE); + if (!over_sized) continue; + data_slice[0]->value__associate_outs[peer_][0] = data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE); + data_slice[0]->value__associate_outs[peer_].Move(util::HOST, util::DEVICE, -1, 0, stream); } + data_slice[0]->keys_outs.Move(util::HOST, util::DEVICE, -1, 0, stream); + data_slice[0]->out_length[0] = temp_length; - return true; - } + Assign_Keys_PR + <<>> ( + graph_slice->nodes, + num_gpus, + graph_slice->partition_table.GetPointer(util::DEVICE), + data_slice[0]->markers .GetPointer(util::DEVICE), + data_slice[0]->keys_markers .GetPointer(util::DEVICE), + data_slice[0]->keys_outs .GetPointer(util::DEVICE)); + + //util::cpu_mt::PrintCPUArray("out_length", &data_slice[0]->out_length[0], num_gpus, thread_num, enactor_stats->iteration); + //for (peer_ = 0; peer_keys_out[peer_].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream); + } - template < - int NUM_VERTEX_ASSOCIATES, - int NUM_VALUE__ASSOCIATES> - static void Make_Output( - int thread_num, - SizeT num_elements, - int num_gpus, - util::DoubleBuffer - *frontier_queue, - util::Array1D *scanned_edges, - FrontierAttribute *frontier_attribute, - EnactorStats *enactor_stats, - util::Array1D - *data_slice, - GraphSlice *graph_slice, - util::CtaWorkProgressLifetime *work_progress, - 
ContextPtr context, - cudaStream_t stream) + for (peer_ = 1; peer_ < num_gpus; peer_ ++) { - //printf("Make_Output entered\n");fflush(stdout); - int peer_ = 0; - int block_size = 512; - int grid_size = graph_slice->nodes / block_size; - if ((graph_slice->nodes % block_size)!=0) grid_size ++; - if (grid_size > 512) grid_size = 512; - - if (num_gpus > 1 && enactor_stats->iteration==0) - { - util::MemsetKernel<<>>(data_slice[0]->markers.GetPointer(util::DEVICE), (SizeT)0, graph_slice->nodes); - frontier_attribute->queue_length = data_slice[0]->edge_map_queue_len; - //util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, -1, stream); - //util::cpu_mt::PrintGPUArray("row_offsets", graph_slice->row_offsets.GetPointer(util::DEVICE), graph_slice->nodes+1, thread_num, enactor_stats->iteration, -1, stream); - //printf("Advance start.\n");fflush(stdout); - frontier_attribute->queue_reset = true; - // Edge Map - gunrock::oprtr::advance::LaunchKernel( - //d_done, - enactor_stats[0], - frontier_attribute[0], - data_slice->GetPointer(util::DEVICE), - (VertexId*)NULL, - (bool* )NULL, - (bool* )NULL, - scanned_edges->GetPointer(util::DEVICE), - frontier_queue->keys[frontier_attribute->selector ].GetPointer(util::DEVICE), // d_in_queue - frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue - (VertexId*)NULL, - (VertexId*)NULL, - graph_slice->row_offsets .GetPointer(util::DEVICE), - graph_slice->column_indices.GetPointer(util::DEVICE), - (SizeT* )NULL, - (VertexId*)NULL, - graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector], // max_in_queue - graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1],// max_out_queue - work_progress[0], - context[0], - stream, - gunrock::oprtr::advance::V2V, - false, - true); - //printf("Advance end.\n");fflush(stdout); - 
//util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); - - for (peer_ = 0; peer_>> ( data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), 0, graph_slice->nodes); - Assign_Marker_PR - <<>> ( - graph_slice->nodes, - num_gpus, - data_slice[0]->markers.GetPointer(util::DEVICE), - graph_slice->partition_table.GetPointer(util::DEVICE), - data_slice[0]->keys_markers.GetPointer(util::DEVICE)); - //for (peer_ = 0; peer_keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); - - for (peer_ = 0; peer_( - (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)), - graph_slice->nodes, - (int)0, mgpu::plus(), (int*)0, (int*)0, - (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)), - context[0]); - //for (peer_ = 0; peer_keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream); - - SizeT temp_length = data_slice[0]->out_length[0]; - if (graph_slice->nodes > 0) for (peer_ = 0; peer_out_length[peer_], - data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE) + (graph_slice->nodes -1), - sizeof(SizeT), cudaMemcpyDeviceToHost, stream); - } else { - for (peer_ = 1; peer_out_length[peer_] = 0; - } - if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; + Assign_Values_PR + <<>> ( + data_slice[0]->out_length[peer_], + data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE), + data_slice[0]->rank_next.GetPointer(util::DEVICE), + data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE)); + } + frontier_attribute->selector = data_slice[0]->PR_queue_selector; + //for (peer_ = 1; peer_ < num_gpus; peer_++) + //{ + // util::cpu_mt::PrintGPUArray("values_out[]", data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, 
stream); + //} + if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; +} - for (peer_ = 0; peer_1) { - data_slice[0]->keys_out[peer_] = data_slice[0]->temp_keys_out[peer_]; - data_slice[0]->temp_keys_out[peer_] = util::Array1D(); - } - if (enactor_stats->retval = Check_Size ( - "keys_out", data_slice[0]->out_length[peer_], &data_slice[0]->keys_out[peer_], over_sized, thread_num, enactor_stats->iteration, peer_)) return; - if (peer_>0) - if (enactor_stats->retval = Check_Size ( - "values_out", data_slice[0]->out_length[peer_], &data_slice[0]->value__associate_out[peer_][0], over_sized, thread_num, enactor_stats->iteration, peer_)) return; - data_slice[0]->keys_outs[peer_] = data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE); - if (!over_sized) continue; - data_slice[0]->value__associate_outs[peer_][0] = data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE); - data_slice[0]->value__associate_outs[peer_].Move(util::HOST, util::DEVICE, -1, 0, stream); - } - data_slice[0]->keys_outs.Move(util::HOST, util::DEVICE, -1, 0, stream); - data_slice[0]->out_length[0] = temp_length; - - Assign_Keys_PR - <<>> ( - graph_slice->nodes, - num_gpus, - graph_slice->partition_table.GetPointer(util::DEVICE), - data_slice[0]->markers .GetPointer(util::DEVICE), - data_slice[0]->keys_markers .GetPointer(util::DEVICE), - data_slice[0]->keys_outs .GetPointer(util::DEVICE)); - - //util::cpu_mt::PrintCPUArray("out_length", &data_slice[0]->out_length[0], num_gpus, thread_num, enactor_stats->iteration); - //for (peer_ = 0; peer_keys_out[peer_].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream); - } +}; - for (peer_ = 1; peer_ < num_gpus; peer_ ++) +/** + * @brief Enacts a page rank computing on the specified graph. + * + * @tparam AdvanceKernelPolicy Kernel policy for advance operator. + * @tparam FilterKernelPolicy Kernel policy for filter operator. + * @tparam PRProblem PR Problem type. 
+ * + * @param[in] context CudaContext pointer for moderngpu APIs. + * @param[in] problem PRProblem object. + * @param[in] max_iteration Maximum iteration number for PR. + * @param[in] max_grid_size Max grid size for PR kernel calls. + * + * \return cudaError_t object which indicates the success of all CUDA calls. + */ +template< + typename AdvanceKernelPolicy, + typename FilterKernelPolicy, + typename PrEnactor> +static CUT_THREADPROC PRThread( + void * thread_data_) +{ + typedef typename PrEnactor::Problem Problem; + typedef typename PrEnactor::SizeT SizeT; + typedef typename PrEnactor::VertexId VertexId; + typedef typename PrEnactor::Value Value; + typedef typename Problem::DataSlice DataSlice; + typedef GraphSlice GraphSlice; + typedef PRFunctor PrFunctor; + ThreadSlice *thread_data = (ThreadSlice*) thread_data_; + Problem *problem = (Problem*) thread_data->problem; + PrEnactor *enactor = (PrEnactor*) thread_data->enactor; + //util::cpu_mt::CPUBarrier + // *cpu_barrier = thread_data -> cpu_barrier; + int num_gpus = problem -> num_gpus; + int thread_num = thread_data -> thread_num; + int gpu_idx = problem -> gpu_idx [thread_num] ; + DataSlice *data_slice = problem -> data_slices [thread_num].GetPointer(util::HOST); + GraphSlice *graph_slice = problem -> graph_slices [thread_num] ; + FrontierAttribute + *frontier_attribute = &(enactor -> frontier_attribute [thread_num * num_gpus]); + EnactorStats *enactor_stats = &(enactor -> enactor_stats [thread_num * num_gpus]); + + do { + printf("CCThread entered\n");fflush(stdout); + if (enactor_stats[0].retval = util::SetDevice(gpu_idx)) break; + thread_data->stats = 1; + while (thread_data->stats !=2) sleep(0); + thread_data->stats = 3; + + for (int peer_=0; peer_ - <<>> ( - data_slice[0]->out_length[peer_], - data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE), - data_slice[0]->rank_next.GetPointer(util::DEVICE), - data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE)); + 
frontier_attribute[peer_].queue_length = peer_==0?data_slice->local_nodes : 0; + frontier_attribute[peer_].queue_index = 0; // Work queue index + frontier_attribute[peer_].selector = 0; + frontier_attribute[peer_].queue_reset = true; + enactor_stats [peer_].iteration = 0; } - frontier_attribute->selector = data_slice[0]->PR_queue_selector; - //for (peer_ = 1; peer_ < num_gpus; peer_++) + //gunrock::app::Iteration_Loop + // <0, 0, PrEnactor, PrFunctor, R0DIteration > (thread_data); + + data_slice->PR_queue_selector = frontier_attribute[0].selector; + //for (int peer_=0; peer_value__associate_out[peer_][0].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream); + // frontier_attribute[peer_].queue_reset = true; + // enactor_stats [peer_].iteration = 0; //} - if (enactor_stats->retval = cudaStreamSynchronize(stream)) return; - } - -}; - - /** - * @brief Enacts a page rank computing on the specified graph. - * - * @tparam AdvanceKernelPolicy Kernel policy for advance operator. - * @tparam FilterKernelPolicy Kernel policy for filter operator. - * @tparam PRProblem PR Problem type. - * - * @param[in] context CudaContext pointer for moderngpu APIs. - * @param[in] problem PRProblem object. - * @param[in] max_iteration Maximum iteration number for PR. - * @param[in] max_grid_size Max grid size for PR kernel calls. - * - * \return cudaError_t object which indicates the success of all CUDA calls. 
- */ - template< - typename AdvanceKernelPolicy, - typename FilterKernelPolicy, - typename PrEnactor> - static CUT_THREADPROC PRThread( - void * thread_data_) - { - typedef typename PrEnactor::Problem Problem; - typedef typename PrEnactor::SizeT SizeT; - typedef typename PrEnactor::VertexId VertexId; - typedef typename PrEnactor::Value Value; - typedef typename Problem::DataSlice DataSlice; - typedef GraphSlice GraphSlice; - typedef PRFunctor PrFunctor; - ThreadSlice *thread_data = (ThreadSlice*) thread_data_; - Problem *problem = (Problem*) thread_data->problem; - PrEnactor *enactor = (PrEnactor*) thread_data->enactor; - //util::cpu_mt::CPUBarrier - // *cpu_barrier = thread_data -> cpu_barrier; - int num_gpus = problem -> num_gpus; - int thread_num = thread_data -> thread_num; - int gpu_idx = problem -> gpu_idx [thread_num] ; - DataSlice *data_slice = problem -> data_slices [thread_num].GetPointer(util::HOST); - GraphSlice *graph_slice = problem -> graph_slices [thread_num] ; - FrontierAttribute - *frontier_attribute = &(enactor -> frontier_attribute [thread_num * num_gpus]); - EnactorStats *enactor_stats = &(enactor -> enactor_stats [thread_num * num_gpus]); - - do { - printf("CCThread entered\n");fflush(stdout); - if (enactor_stats[0].retval = util::SetDevice(gpu_idx)) break; - thread_data->stats = 1; - while (thread_data->stats !=2) sleep(0); - thread_data->stats = 3; - - for (int peer_=0; peer_local_nodes : 0; - frontier_attribute[peer_].queue_index = 0; // Work queue index - frontier_attribute[peer_].selector = 0; - frontier_attribute[peer_].queue_reset = true; - enactor_stats [peer_].iteration = 0; - } - //gunrock::app::Iteration_Loop - // <0, 0, PrEnactor, PrFunctor, R0DIteration > (thread_data); - - data_slice->PR_queue_selector = frontier_attribute[0].selector; - //for (int peer_=0; peer_ 1) - { - data_slice->value__associate_orgs[0] = data_slice->rank_next.GetPointer(util::DEVICE); - data_slice->value__associate_orgs.Move(util::HOST, util::DEVICE); - 
//util::cpu_mt::IncrementnWaitBarrier(cpu_barrier, thread_num); - //for (int i=0; i<4; i++) - //for (int gpu=0; gpunum_stages; stage++) - // data_slice->events_set[i][gpu][stage] = false; - //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier+1, thread_num); - } - data_slice -> edge_map_queue_len = frontier_attribute[0].queue_length; - //util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num); + if (num_gpus > 1) + { + data_slice->value__associate_orgs[0] = data_slice->rank_next.GetPointer(util::DEVICE); + data_slice->value__associate_orgs.Move(util::HOST, util::DEVICE); + //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier, thread_num); + //for (int i=0; i<4; i++) + //for (int gpu=0; gpunum_stages; stage++) + // data_slice->events_set[i][gpu][stage] = false; + //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier+1, thread_num); + } + data_slice -> edge_map_queue_len = frontier_attribute[0].queue_length; + //util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num); - // Step through PR iterations - gunrock::app::Iteration_Loop - <0, 1, PrEnactor, PrFunctor, PRIteration > (thread_data); - - if (thread_num > 0) - { - bool over_sized = false; - if (enactor_stats->retval = Check_Size( - "values_out", data_slice->local_nodes, &data_slice->value__associate_out[1][0], over_sized, thread_num, enactor_stats->iteration, -1)) break; - if (enactor_stats->retval = Check_Size( - "keys_out", data_slice->local_nodes, &data_slice->keys_out[1], over_sized, thread_num, enactor_stats->iteration, -1)) break; - Assign_Values_PR - <<<128, 128, 0, data_slice->streams[0]>>> ( - data_slice->local_nodes, - data_slice->keys_out[0].GetPointer(util::DEVICE), - data_slice->rank_curr.GetPointer(util::DEVICE), - data_slice->value__associate_out[1][0].GetPointer(util::DEVICE)); - util::MemsetCopyVectorKernel<<<128, 128, 0, data_slice->streams[0]>>> ( - 
data_slice->keys_out[1].GetPointer(util::DEVICE), - data_slice->keys_out[0].GetPointer(util::DEVICE), - data_slice->local_nodes); - enactor_stats->iteration++; - PushNeibor ( - thread_num, - 0, - data_slice->local_nodes, - enactor_stats, + // Step through PR iterations + gunrock::app::Iteration_Loop + <0, 1, PrEnactor, PrFunctor, PRIteration > (thread_data); + + if (thread_num > 0) + { + bool over_sized = false; + if (enactor_stats->retval = Check_Size( + "values_out", data_slice->local_nodes, &data_slice->value__associate_out[1][0], over_sized, thread_num, enactor_stats->iteration, -1)) break; + if (enactor_stats->retval = Check_Size( + "keys_out", data_slice->local_nodes, &data_slice->keys_out[1], over_sized, thread_num, enactor_stats->iteration, -1)) break; + Assign_Values_PR + <<<128, 128, 0, data_slice->streams[0]>>> ( + data_slice->local_nodes, + data_slice->keys_out[0].GetPointer(util::DEVICE), + data_slice->rank_curr.GetPointer(util::DEVICE), + data_slice->value__associate_out[1][0].GetPointer(util::DEVICE)); + util::MemsetCopyVectorKernel<<<128, 128, 0, data_slice->streams[0]>>> ( + data_slice->keys_out[1].GetPointer(util::DEVICE), + data_slice->keys_out[0].GetPointer(util::DEVICE), + data_slice->local_nodes); + enactor_stats->iteration++; + PushNeibor ( + thread_num, + 0, + data_slice->local_nodes, + enactor_stats, problem->data_slices [thread_num].GetPointer(util::HOST), problem->data_slices [0 ].GetPointer(util::HOST), problem->graph_slices[thread_num], diff --git a/gunrock/app/sssp/sssp_app.cu b/gunrock/app/sssp/sssp_app.cu index fa55888be..c7621b8a0 100644 --- a/gunrock/app/sssp/sssp_app.cu +++ b/gunrock/app/sssp/sssp_app.cu @@ -8,21 +8,19 @@ /** * @file sssp_app.cu * - * @brief single-source shortest path problem implementation + * @brief single-source shortest path (SSSP) application */ -#include #include -// Graph construction utils +// graph construction utilities #include -// SSSP includes +// single-source shortest path includes #include 
#include #include -// Moderngpu include #include using namespace gunrock; @@ -38,7 +36,7 @@ using namespace gunrock::app::sssp; * @tparam SizeT * @tparam MARK_PREDECESSORS * - * @param[out] ggraph_out GunrockGraph type output + * @param[out] graph_o GRGraph type output * @param[out] predecessor return predeessor if mark_pred = true * @param[in] graph Reference to the CSR graph we process on * @param[in] source Source node where SSSP starts @@ -48,125 +46,169 @@ using namespace gunrock::app::sssp; * @param[in] delta_factor user set * @param[in] context moderngpu context */ -template < - typename VertexId, - typename Value, - typename SizeT, - bool MARK_PREDECESSORS > +template void run_sssp( - GunrockGraph *ggraph_out, - VertexId *predecessor, - const Csr &graph, - const VertexId source, + GRGraph* graph_o, + VertexId* predecessor, + const Csr& csr, + const VertexId src, const int max_grid_size, const float queue_sizing, const int num_gpus, const int delta_factor, - CudaContext& context) { - // Preparations - typedef SSSPProblem < - VertexId, - SizeT, - Value, - MARK_PREDECESSORS > Problem; - + CudaContext& context) { + typedef SSSPProblem Problem; // Allocate host-side label array for gpu-computed results - unsigned int *h_labels - = (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes); + Value *h_labels = (Value*)malloc(sizeof(Value) * csr.nodes); //VertexId *h_preds = NULL; if (MARK_PREDECESSORS) { - //h_preds = (VertexId*)malloc(sizeof(VertexId) * graph.nodes); + //h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes); } - // Allocate SSSP enactor map - SSSPEnactor sssp_enactor(false); + SSSPEnactor enactor(false); // enactor map + Problem *problem = new Problem; + util::GRError(problem->Init(false, csr, num_gpus, delta_factor), + "SSSP Problem Initialization Failed", __FILE__, __LINE__); - // Allocate problem on GPU - Problem *csr_problem = new Problem; - util::GRError(csr_problem->Init( - false, - graph, - num_gpus, - delta_factor), - "Problem SSSP 
Initialization Failed", __FILE__, __LINE__); + util::GRError(problem->Reset(src, enactor.GetFrontierType(), queue_sizing), + "SSSP Problem Data Reset Failed", __FILE__, __LINE__); - // Perform SSSP - CpuTimer gpu_timer; + GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start(); // start - util::GRError(csr_problem->Reset( - source, sssp_enactor.GetFrontierType(), queue_sizing), - "SSSP Problem Data Reset Failed", __FILE__, __LINE__); - gpu_timer.Start(); - util::GRError(sssp_enactor.template Enact( - context, csr_problem, source, - queue_sizing, max_grid_size), + util::GRError(enactor.template Enact( + context, problem, src, queue_sizing, max_grid_size), "SSSP Problem Enact Failed", __FILE__, __LINE__); - gpu_timer.Stop(); - float elapsed = gpu_timer.ElapsedMillis(); - // Copy out results - util::GRError(csr_problem->Extract(h_labels, predecessor), - "SSSP Problem Data Extraction Failed", __FILE__, __LINE__); + gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis(); // elapsed time + printf(" device elapsed time: %.4f ms\n", elapsed); - // copy label_values per node to GunrockGraph output - ggraph_out->node_values = (unsigned int*)&h_labels[0]; + util::GRError(problem->Extract(h_labels, predecessor), + "SSSP Problem Data Extraction Failed", __FILE__, __LINE__); - if (csr_problem) delete csr_problem; - //if (h_labels) free(h_labels); - //if (h_preds) free(h_preds); + // copy label_values per node to GRGraph output + graph_o->node_values = (Value*)&h_labels[0]; + if (problem) { delete problem; } cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out GunrockGraph type output - * @param[out] predecessor return predeessor if mark_pred = true - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] sssp_config sssp specific configurations - * @param[in] data_type sssp data_type configurations - * @param[in] context moderngpu context + * @param[out] graph_o GRGraph type output + * @param[out] predecessor 
Return predeessor if mark_pred = true + * @param[in] graph_i GRGraph type input graph + * @param[in] config Primitive-specific configurations + * @param[in] data_t Data type configurations + * @param[in] context ModernGPU context */ void dispatch_sssp( - GunrockGraph *ggraph_out, - void *predecessor, - const GunrockGraph *ggraph_in, - const GunrockConfig sssp_config, - const GunrockDataType data_type, - CudaContext& context) { - switch (data_type.VTXID_TYPE) { + GRGraph* graph_o, + void* predecessor, + const GRGraph* graph_i, + const GRSetup config, + const GRTypes data_t, + CudaContext& context) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = - // not support yet - printf("Not Yet Support This DataType Combination.\n"); + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = + Csr csr_graph(false); + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; + csr_graph.edge_values = (int*)graph_i->edge_values; + + // sssp configurations + bool mark_pred = 0; // whether to mark predecessors + int src_node = 0; // source vertex to start + int num_gpus = 1; // number of GPUs + int delta_factor = 1; // default delta_factor = 1 + int max_grid_size = 0; // leave it up to the enactor + float max_queue_sizing = 1.0; // default maximum queue sizing + + // determine source vertex to start sssp + switch (config.src_mode) { + case randomize: { + src_node = graphio::RandomNode(csr_graph.nodes); + break; + } + case largest_degree: { + int max_deg = 0; + src_node = csr_graph.GetNodeWithHighestDegree(max_deg); + break; + } + case manually: { + src_node = config.src_node; + break; + } + default: { + src_node = 0; + break; + } + } + mark_pred = config.mark_pred; + delta_factor = 
config.delta_factor; + max_queue_sizing = config.queue_size; + + switch (mark_pred) { + case true: { + run_sssp( + graph_o, + (int*)predecessor, + csr_graph, + src_node, + max_grid_size, + max_queue_sizing, + num_gpus, + delta_factor, + context); + break; + } + case false: { + run_sssp( + graph_o, + (int*)predecessor, + csr_graph, + src_node, + max_grid_size, + max_queue_sizing, + num_gpus, + delta_factor, + context); + break; + } + } + // reset for free memory + csr_graph.row_offsets = NULL; + csr_graph.column_indices = NULL; + csr_graph.edge_values = NULL; break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = // build input csr format graph Csr csr_graph(false); - csr_graph.nodes = ggraph_in->num_nodes; - csr_graph.edges = ggraph_in->num_edges; - csr_graph.row_offsets = (int*)ggraph_in->row_offsets; - csr_graph.column_indices = (int*)ggraph_in->col_indices; - csr_graph.edge_values = (unsigned int*)ggraph_in->edge_values; + csr_graph.nodes = graph_i->num_nodes; + csr_graph.edges = graph_i->num_edges; + csr_graph.row_offsets = (int*)graph_i->row_offsets; + csr_graph.column_indices = (int*)graph_i->col_indices; + csr_graph.edge_values = (unsigned int*)graph_i->edge_values; // sssp configurations - bool mark_pred = false; - int src_node = 0; //!< use whatever the specified graph-type's default is - int num_gpus = 1; //!< number of GPUs for multi-gpu enactor to use - int delta_factor = 1; //!< default delta_factor = 1 - int max_grid_size = 0; //!< maximum grid size (0: leave it up to the enactor) - float max_queue_sizing = 1.0; //!< default maximum queue sizing + bool mark_pred = 0; // whether to mark predecessors + int src_node = 0; // source vertex to start + int num_gpus = 1; // number of GPUs + int delta_factor = 1; // default delta_factor = 1 + int max_grid_size = 0; // leave it up to the enactor + float max_queue_sizing = 1.0; // default maximum queue sizing // determine source vertex to start sssp - switch 
(sssp_config.src_mode) { + switch (config.src_mode) { case randomize: { src_node = graphio::RandomNode(csr_graph.nodes); break; @@ -177,7 +219,7 @@ void dispatch_sssp( break; } case manually: { - src_node = sssp_config.src_node; + src_node = config.src_node; break; } default: { @@ -185,14 +227,14 @@ void dispatch_sssp( break; } } - mark_pred = sssp_config.mark_pred; - delta_factor = sssp_config.delta_factor; - max_queue_sizing = sssp_config.queue_size; + mark_pred = config.mark_pred; + delta_factor = config.delta_factor; + max_queue_sizing = config.queue_size; switch (mark_pred) { case true: { run_sssp( - ggraph_out, + graph_o, (int*)predecessor, csr_graph, src_node, @@ -205,7 +247,7 @@ void dispatch_sssp( } case false: { run_sssp( - ggraph_out, + graph_o, (int*)predecessor, csr_graph, src_node, @@ -245,32 +287,75 @@ void dispatch_sssp( * @tparam Value * @tparam SizeT * - * @param[out] ggraph_out GunrockGraph type output - * @param[out] predecessor return predeessor if mark_pred = true - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] sssp_config gunrock primitive specific configurations - * @param[in] data_type data_type configurations + * @param[out] graph_o GRGraph type output + * @param[out] predecessor Return predeessor if mark_pred = true + * @param[in] graph_i GRGraph type input graph + * @param[in] config Primitive specific configurations + * @param[in] data_t Data type configurations */ -void gunrock_sssp_func( - GunrockGraph *ggraph_out, - void *predecessor, - const GunrockGraph *ggraph_in, - const GunrockConfig sssp_config, - const GunrockDataType data_type) { - - // moderngpu preparations - int device = 0; - device = sssp_config.device; +void gunrock_sssp( + GRGraph* graph_o, + void* predecessor, + const GRGraph* graph_i, + const GRSetup config, + const GRTypes data_t) { + unsigned int device = 0; + device = config.device; ContextPtr context = mgpu::CreateCudaDevice(device); + dispatch_sssp(graph_o, predecessor, graph_i, config, 
data_t, *context); +} + +/* + * @brief Simple interface take in CSR arrays as input + * @param[out] distances Return shortest distance to source per nodes + * @param[in] num_nodes Number of nodes of the input graph + * @param[in] num_edges Number of edges of the input graph + * @param[in] row_offsets CSR-formatted graph input row offsets + * @param[in] col_indices CSR-formatted graph input column indices + * @param[in] source Source to begin traverse + */ +void sssp( + unsigned int* distances, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const unsigned int* edge_values, + const int source) { + printf("-------------------- setting --------------------\n"); + + struct GRTypes data_t; // primitive-specific data types + data_t.VTXID_TYPE = VTXID_INT; // integer + data_t.SIZET_TYPE = SIZET_INT; // integer + data_t.VALUE_TYPE = VALUE_UINT; // unsigned integer + + struct GRSetup config; // primitive-specific configures + config.device = 0; // setting device to run + config.src_node = source; // source vertex to begin + config.mark_pred = false; // do not mark predecessors + config.delta_factor = 32; // delta factor for delta-stepping + config.queue_size = 1.0f; // maximum queue size factor + + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + graph_i->edge_values = (void*)&edge_values[0]; + + printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges); + + printf("-------------------- running --------------------\n"); + gunrock_sssp(graph_o, (void*)NULL, graph_i, config, data_t); + memcpy(distances, (unsigned int*)graph_o->node_values, + num_nodes * sizeof(unsigned int)); + + if (graph_i) free(graph_i); + if (graph_o) free(graph_o); - // lunch 
dispatch function - dispatch_sssp( - ggraph_out, - predecessor, - ggraph_in, - sssp_config, - data_type, - *context); + printf("------------------- completed -------------------\n"); } // Leave this at the end of the file diff --git a/gunrock/app/sssp/sssp_enactor.cuh b/gunrock/app/sssp/sssp_enactor.cuh index f8d56b085..362d90477 100644 --- a/gunrock/app/sssp/sssp_enactor.cuh +++ b/gunrock/app/sssp/sssp_enactor.cuh @@ -575,28 +575,6 @@ public: { clock_t start_time = clock(); cudaError_t retval = cudaSuccess; - - /*typedef PQFunctor< - VertexId, - SizeT, - SSSPProblem> PqFunctor; - - typedef gunrock::priority_queue::PriorityQueue< - VertexId, - SizeT> NearFarPriorityQueue; - - typedef gunrock::priority_queue::KernelPolicy< - SSSPProblem, // Problem data type - 300, // CUDA_ARCH - INSTRUMENT, // INSTRUMENT - 8, // MIN_CTA_OCCUPANCY - 10> // LOG_THREADS - PriorityQueueKernelPolicy; - - NearFarPriorityQueue *pq = new NearFarPriorityQueue; - util::GRError( - pq->Init(problem->graph_slices[0]->edges, queue_sizing), - "Priority Queue SSSP Initialization Failed", __FILE__, __LINE__);*/ do { for (int gpu=0;gpunum_gpus;gpu++) diff --git a/gunrock/app/sssp/sssp_functor.cuh b/gunrock/app/sssp/sssp_functor.cuh index f406de7e5..9867a8a97 100644 --- a/gunrock/app/sssp/sssp_functor.cuh +++ b/gunrock/app/sssp/sssp_functor.cuh @@ -149,7 +149,7 @@ struct PQFunctor float delta; util::io::ModifiedLoad::Ld( delta, problem->delta); - return (delta == 0) ? weight : weight/delta; + return (delta == 0) ? 
weight : weight / delta; } }; diff --git a/gunrock/app/sssp/sssp_problem.cuh b/gunrock/app/sssp/sssp_problem.cuh index c11261cb1..edfbfe988 100644 --- a/gunrock/app/sssp/sssp_problem.cuh +++ b/gunrock/app/sssp/sssp_problem.cuh @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include diff --git a/gunrock/app/topk/topk_app.cu b/gunrock/app/topk/topk_app.cu index 5b2855259..0e38c2fcf 100644 --- a/gunrock/app/topk/topk_app.cu +++ b/gunrock/app/topk/topk_app.cu @@ -1,20 +1,16 @@ -// ---------------------------------------------------------------- +// ---------------------------------------------------------------------------- // Gunrock -- Fast and Efficient GPU Graph Library -// ---------------------------------------------------------------- +// ---------------------------------------------------------------------------- // This source code is distributed under the terms of LICENSE.TXT // in the root directory of this source distribution. -// ---------------------------------------------------------------- +// ---------------------------------------------------------------------------- /** * @file topk_app.cu * - * @brief top k degree centralities implementation + * @brief top k degree centralities application */ -#include -#include -#include -#include #include #include #include @@ -77,7 +73,7 @@ template < typename Value, typename SizeT > void build_topk_subgraph( - GunrockGraph *subgraph, + GRGraph *subgraph, const Csr &graph_original, const Csr &graph_reversed, VertexId *node_ids, @@ -173,49 +169,32 @@ template < typename Value, typename SizeT > void run_topk( - GunrockGraph *graph_out, + GRGraph *graph_out, VertexId *node_ids, Value *in_degrees, Value *out_degrees, const Csr &graph_original, const Csr &graph_reversed, SizeT top_nodes) { - // preparations typedef TOPKProblem Problem; - TOPKEnactor topk_enactor(false); - Problem *topk_problem = new Problem; - - // reset top_nodes if necessary + TOPKEnactor enactor(false); + Problem *problem = new Problem; 
top_nodes = (top_nodes > graph_original.nodes) ? graph_original.nodes : top_nodes; - // initialization - util::GRError(topk_problem->Init( - false, - graph_original, - graph_reversed, - 1), + util::GRError(problem->Init(false, graph_original, graph_reversed, 1), "Problem TOPK Initialization Failed", __FILE__, __LINE__); - // reset data slices - util::GRError(topk_problem->Reset(topk_enactor.GetFrontierType()), + util::GRError(problem->Reset(enactor.GetFrontierType()), "TOPK Problem Data Reset Failed", __FILE__, __LINE__); - // launch gpu topk enactor to calculate top k nodes - util::GRError(topk_enactor.template Enact( - topk_problem, - top_nodes), + util::GRError(enactor.template Enact(problem, top_nodes), "TOPK Problem Enact Failed", __FILE__, __LINE__); - // copy out results back to cpu - util::GRError(topk_problem->Extract( - node_ids, - in_degrees, - out_degrees, - top_nodes), + util::GRError(problem->Extract(node_ids, in_degrees, out_degrees, top_nodes), "TOPK Problem Data Extraction Failed", __FILE__, __LINE__); - // build a subgraph contains only top k nodes on cpu + // build vertex-induced subgraph contains only top k nodes build_topk_subgraph( graph_out, graph_original, @@ -223,62 +202,54 @@ void run_topk( (int*)node_ids, top_nodes); - // cleanup if neccessary - if (topk_problem) { delete topk_problem; } - + if (problem) { delete problem; } cudaDeviceSynchronize(); } /** * @brief dispatch function to handle data_types * - * @param[out] ggraph_out GunrockGraph type output + * @param[out] graph_o GRGraph type output * @param[out] node_ids output top k node ids * @param[out] in_degrees output top k in-degree centralities * @param[out] out_degrees output top k out-degree centralities - * @param[in] ggraph_in GunrockGraph type input graph - * @param[in] topk_config topk specific configurations - * @param[in] data_type topk data_type configurations + * @param[in] graph_i GRGraph type input graph + * @param[in] config topk specific configurations + * @param[in] 
data_t topk data_t configurations */ void dispatch_topk( - GunrockGraph *ggraph_out, - void *node_ids, - void *in_degrees, - void *out_degrees, - const GunrockGraph *ggraph_in, - const GunrockConfig topk_config, - const GunrockDataType data_type) { - switch (data_type.VTXID_TYPE) { + GRGraph *graph_o, + void *node_ids, + void *in_degrees, + void *out_degrees, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t) { + switch (data_t.VTXID_TYPE) { case VTXID_INT: { - switch (data_type.SIZET_TYPE) { + switch (data_t.SIZET_TYPE) { case SIZET_INT: { - switch (data_type.VALUE_TYPE) { - case VALUE_INT: { - // template type = - // original graph + switch (data_t.VALUE_TYPE) { + case VALUE_INT: { // template type = Csr graph_original(false); - graph_original.nodes = ggraph_in->num_nodes; - graph_original.edges = ggraph_in->num_edges; - graph_original.row_offsets = (int*)ggraph_in->row_offsets; - graph_original.column_indices = (int*)ggraph_in->col_indices; - - // reversed graph + graph_original.nodes = graph_i->num_nodes; + graph_original.edges = graph_i->num_edges; + graph_original.row_offsets = (int*)graph_i->row_offsets; + graph_original.column_indices = (int*)graph_i->col_indices; Csr graph_reversed(false); - graph_reversed.nodes = ggraph_in->num_nodes; - graph_reversed.edges = ggraph_in->num_edges; - graph_reversed.row_offsets = (int*)ggraph_in->col_offsets; - graph_reversed.column_indices = (int*)ggraph_in->row_indices; - - //graph_original.DisplayGraph(); + graph_reversed.nodes = graph_i->num_nodes; + graph_reversed.edges = graph_i->num_edges; + graph_reversed.row_offsets = (int*)graph_i->col_offsets; + graph_reversed.column_indices = (int*)graph_i->row_indices; run_topk( - ggraph_out, + graph_o, (int*)node_ids, (int*)in_degrees, (int*)out_degrees, graph_original, graph_reversed, - topk_config.top_nodes); + config.top_nodes); // reset for free memory graph_original.row_offsets = NULL; @@ -287,13 +258,11 @@ void dispatch_topk( 
graph_reversed.column_indices = NULL; break; } - case VALUE_UINT: { - // template type = + case VALUE_UINT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } - case VALUE_FLOAT: { - // template type = + case VALUE_FLOAT: { // template type = printf("Not Yet Support This DataType Combination.\n"); break; } @@ -309,32 +278,24 @@ void dispatch_topk( /* * @brief topk dispatch function base on gunrock data types * - * @param[out] ggraph_out output subgraph of topk problem + * @param[out] graph_o output subgraph of topk problem * @param[out] node_ids output top k node_ids * @param[out] in_degrees output associated centrality values * @param[out] out_degrees output associated centrality values - * @param[in] ggraph_in input graph need to process on - * @param[in] topk_config gunrock primitive specific configurations - * @param[in] data_type gunrock datatype struct + * @param[in] graph_i input graph need to process on + * @param[in] config gunrock primitive specific configurations + * @param[in] data_t gunrock data_t struct */ -void gunrock_topk_func( - GunrockGraph *ggraph_out, - void *node_ids, - void *in_degrees, - void *out_degrees, - const GunrockGraph *ggraph_in, - const GunrockConfig topk_config, - const GunrockDataType data_type) { - - // launch topk dispatch function - dispatch_topk( - ggraph_out, - node_ids, - in_degrees, - out_degrees, - ggraph_in, - topk_config, - data_type); +void gunrock_topk( + GRGraph *graph_o, + void *node_ids, + void *in_degrees, + void *out_degrees, + const GRGraph *graph_i, + const GRSetup config, + const GRTypes data_t) { + dispatch_topk(graph_o, node_ids, in_degrees, out_degrees, + graph_i, config, data_t); } // Leave this at the end of the file diff --git a/gunrock/app/vis/vis_enactor.cuh b/gunrock/app/vis/vis_enactor.cuh new file mode 100644 index 000000000..590863cb6 --- /dev/null +++ b/gunrock/app/vis/vis_enactor.cuh @@ -0,0 +1,395 @@ +// 
---------------------------------------------------------------------------- +// Gunrock -- High-Performance Graph Primitives on GPU +// ---------------------------------------------------------------------------- +// This source code is distributed under the terms of LICENSE.TXT +// in the root directory of this source distribution. +// ---------------------------------------------------------------------------- + +/** + * @file vis_enactor.cuh + * @brief Primitive problem enactor for Vertex-Induced Subgraph + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace gunrock { +namespace app { +namespace vis { + +/** + * @brief Primitive enactor class. + * @tparam INSTRUMENT Boolean indicating whether to collect per-CTA clock-count statistics + */ +template +class VISEnactor : public EnactorBase { + protected: + /** + * A pinned, mapped word that the traversal kernels will signal when done + */ + volatile int *done; + int *d_done; + cudaEvent_t throttle_event; + + /** + * @brief Prepare the enactor for kernel call. + * @param[in] problem Problem object holds both graph and primitive data. + * \return cudaError_t object indicates the success of all CUDA functions. 
+ */ + template + cudaError_t Setup(ProblemData *problem) { + typedef typename ProblemData::SizeT SizeT; + typedef typename ProblemData::VertexId VertexId; + + cudaError_t retval = cudaSuccess; + + // initialize the host-mapped "done" + if (!done) { + int flags = cudaHostAllocMapped; + + // allocate pinned memory for done + if (retval = util::GRError( + cudaHostAlloc((void**)&done, sizeof(int) * 1, flags), + "Enactor cudaHostAlloc done failed", + __FILE__, __LINE__)) return retval; + + // map done into GPU space + if (retval = util::GRError( + cudaHostGetDevicePointer((void**)&d_done, (void*) done, 0), + "Enactor cudaHostGetDevicePointer done failed", + __FILE__, __LINE__)) return retval; + + // create throttle event + if (retval = util::GRError( + cudaEventCreateWithFlags(&throttle_event, cudaEventDisableTiming), + "Enactor cudaEventCreateWithFlags throttle_event failed", + __FILE__, __LINE__)) return retval; + } + + done[0] = -1; + + // graph slice + typename ProblemData::GraphSlice *graph_slice = problem->graph_slices[0]; + // TODO: uncomment if using data_slice to store primitive-specific array + //typename ProblemData::DataSlice *data_slice = problem->data_slices[0]; + + do { + // bind row-offsets and bit-mask texture + cudaChannelFormatDesc row_offsets_desc = cudaCreateChannelDesc(); + oprtr::edge_map_forward::RowOffsetTex::ref.channelDesc = row_offsets_desc; + if (retval = util::GRError( + cudaBindTexture( + 0, + oprtr::edge_map_forward::RowOffsetTex::ref, + graph_slice->d_row_offsets, + (graph_slice->nodes + 1) * sizeof(SizeT)), + "Enactor cudaBindTexture row_offset_tex_ref failed", + __FILE__, __LINE__)) break; + } while (0); + return retval; + } + + public: + /** + * @brief Constructor + */ + explicit VISEnactor(bool DEBUG = false) : + EnactorBase(EDGE_FRONTIERS, DEBUG), done(NULL), d_done(NULL) {} + + /** + * @brief Destructor + */ + virtual ~VISEnactor() { + if (done) { + util::GRError(cudaFreeHost((void*)done), + "Enactor FreeHost done failed", 
__FILE__, __LINE__); + util::GRError(cudaEventDestroy(throttle_event), + "Enactor Destroy throttle_event failed", __FILE__, __LINE__); + } + } + + /** + * \addtogroup PublicInterface + * @{ + */ + + /** + * @brief Obtain statistics the primitive enacted. + * @param[out] num_iterations Number of iterations (BSP super-steps). + */ + template + void GetStatistics(VertexId &num_iterations) { + cudaThreadSynchronize(); + num_iterations = enactor_stats.iteration; + } + + /** @} */ + + /** + * @brief Enacts computing on the specified graph. + * + * @tparam AdvanceKernelPolicy Kernel policy for advance operator. + * @tparam FilterKernelPolicy Kernel policy for filter operator. + * @tparam Problem Problem type. + * + * @param[in] context CudaContext pointer for ModernGPU APIs + * @param[in] problem Problem object. + * @param[in] max_grid_size Max grid size for kernel calls. + * + * \return cudaError_t object indicates the success of all CUDA functions. + */ + template < + typename AdvanceKernelPolicy, + typename FilterKernelPolicy, + typename Problem > + cudaError_t EnactVIS( + CudaContext & context, + Problem * problem, + int max_grid_size = 0) { + typedef typename Problem::VertexId VertexId; + typedef typename Problem::Value Value; + typedef typename Problem::SizeT SizeT; + + typedef VISFunctor Functor; + + cudaError_t retval = cudaSuccess; + + do { + unsigned int *d_scanned_edges = NULL; + + fflush(stdout); + + // lazy initialization + if (retval = Setup(problem)) break; + + if (retval = EnactorBase::Setup( + max_grid_size, + AdvanceKernelPolicy::CTA_OCCUPANCY, + FilterKernelPolicy::CTA_OCCUPANCY)) + break; + + // single-gpu graph slice and data slice + typename Problem::GraphSlice *g_slice = problem->graph_slices[0]; + typename Problem::DataSlice *d_slice = problem->d_data_slices[0]; + + if (AdvanceKernelPolicy::ADVANCE_MODE == oprtr::advance::LB) { + if (retval = util::GRError( + cudaMalloc((void**)&d_scanned_edges, + g_slice->edges * sizeof(unsigned int)), + 
"VISProblem cudaMalloc d_scanned_edges failed", + __FILE__, __LINE__)) return retval; + } + + frontier_attribute.queue_length = g_slice->nodes; + frontier_attribute.queue_index = 0; // work queue index + frontier_attribute.selector = 0; + frontier_attribute.queue_reset = true; + + // filter: intput all vertices in graph, output selected vertices + oprtr::filter::Kernel + <<>>( + enactor_stats.iteration + 1, + frontier_attribute.queue_reset, + frontier_attribute.queue_index, + enactor_stats.num_gpus, + frontier_attribute.queue_length, + d_done, + g_slice->frontier_queues.d_keys[frontier_attribute.selector], + NULL, + g_slice->frontier_queues.d_keys[frontier_attribute.selector^1], + d_slice, + NULL, + work_progress, + g_slice->frontier_elements[frontier_attribute.selector], + g_slice->frontier_elements[frontier_attribute.selector^1], + enactor_stats.filter_kernel_stats); + + if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), + "filter::Kernel failed", __FILE__, __LINE__))) break; + cudaEventQuery(throttle_event); + + frontier_attribute.queue_index++; + frontier_attribute.selector ^= 1; + + if (retval = work_progress.GetQueueLength( + frontier_attribute.queue_index, + frontier_attribute.queue_length)) break; + if (DEBUG) { + printf("filter queue length: %lld", + (long long) frontier_attribute.queue_length); + util::DisplayDeviceResults( + problem->data_slices[0]->d_bitmask, g_slice->nodes); + printf("input queue for advance:\n"); + util::DisplayDeviceResults( + g_slice->frontier_queues.d_keys[frontier_attribute.selector], + frontier_attribute.queue_length); + } + + oprtr::advance::LaunchKernel( + NULL, + enactor_stats, + frontier_attribute, + d_slice, + (VertexId*)NULL, + (bool*)NULL, + (bool*)NULL, + d_scanned_edges, + g_slice->frontier_queues.d_keys[frontier_attribute.selector], + g_slice->frontier_queues.d_keys[frontier_attribute.selector^1], + (VertexId*)NULL, + (VertexId*)NULL, + g_slice->d_row_offsets, + g_slice->d_column_indices, + (SizeT*)NULL, + 
(VertexId*)NULL, + g_slice->nodes, + g_slice->edges, + this->work_progress, + context, + gunrock::oprtr::advance::V2V); + + if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), + "advance::Kernel failed", __FILE__, __LINE__))) break; + cudaEventQuery(throttle_event); + + frontier_attribute.queue_index++; + + if (DEBUG) { + if (retval = work_progress.GetQueueLength( + frontier_attribute.queue_index, + frontier_attribute.queue_length)) break; + printf("advance queue length: %lld", + (long long) frontier_attribute.queue_length); + util::DisplayDeviceResults( + g_slice->frontier_queues.d_keys[frontier_attribute.selector^1], + frontier_attribute.queue_length); + } + + // TODO: extract graph with proper format (edge list, csr, etc.) + + if (d_scanned_edges) cudaFree(d_scanned_edges); + + } while (0); + + if (DEBUG) { + printf("\nGPU Vertex-Induced Subgraph Enact Done.\n"); + } + + return retval; + } + + /** + * \addtogroup PublicInterface + * @{ + */ + + /** + * @brief Primitive enact kernel entry. + * + * @tparam Problem Problem type. @see Problem + * + * @param[in] context CudaContext pointer for ModernGPU APIs + * @param[in] problem Pointer to Problem object. + * @param[in] max_grid_size Max grid size for kernel calls. + * @param[in] traversal_mode Traversal Mode for advance operator: + * Load-balanced or Dynamic cooperative + * + * \return cudaError_t object indicates the success of all CUDA functions. 
+ */ + template + cudaError_t Enact( + CudaContext &context, + Problem *problem, + int max_grid_size = 0, + int traversal_mode = 0) { + if (this->cuda_props.device_sm_version >= 300) { + typedef oprtr::filter::KernelPolicy < + Problem, // Problem data type + 300, // CUDA_ARCH + INSTRUMENT, // INSTRUMENT + 0, // SATURATION QUIT + true, // DEQUEUE_PROBLEM_SIZE + 8, // MIN_CTA_OCCUPANCY + 8, // LOG_THREADS + 1, // LOG_LOAD_VEC_SIZE + 0, // LOG_LOADS_PER_TILE + 5, // LOG_RAKING_THREADS + 5, // END_BITMASK_CULL + 8 > // LOG_SCHEDULE_GRANULARITY + FilterKernelPolicy; + + typedef oprtr::advance::KernelPolicy < + Problem, // Problem data type + 300, // CUDA_ARCH + INSTRUMENT, // INSTRUMENT + 1, // MIN_CTA_OCCUPANCY + 7, // LOG_THREADS + 8, // LOG_BLOCKS + 32 * 128, // LIGHT_EDGE_THRESHOLD (used for LB) + 1, // LOG_LOAD_VEC_SIZE + 0, // LOG_LOADS_PER_TILE + 5, // LOG_RAKING_THREADS + 32, // WARP_GATHER_THRESHOLD + 128 * 4, // CTA_GATHER_THRESHOLD + 7, // LOG_SCHEDULE_GRANULARITY + oprtr::advance::TWC_FORWARD > + ForwardAdvanceKernelPolicy; + + typedef oprtr::advance::KernelPolicy < + Problem, // Problem data type + 300, // CUDA_ARCH + INSTRUMENT, // INSTRUMENT + 1, // MIN_CTA_OCCUPANCY + 10, // LOG_THREADS + 8, // LOG_BLOCKS + 32 * 128, // LIGHT_EDGE_THRESHOLD (used for LB) + 1, // LOG_LOAD_VEC_SIZE + 0, // LOG_LOADS_PER_TILE + 5, // LOG_RAKING_THREADS + 32, // WARP_GATHER_THRESHOLD + 128 * 4, // CTA_GATHER_THRESHOLD + 7, // LOG_SCHEDULE_GRANULARITY + oprtr::advance::LB > + LBAdvanceKernelPolicy; + + if (traversal_mode == 0) { + return EnactVIS< + LBAdvanceKernelPolicy, FilterKernelPolicy, Problem>( + context, problem, max_grid_size); + } else { // traversal_mode == 1 + return EnactVIS< + ForwardAdvanceKernelPolicy, FilterKernelPolicy, Problem>( + context, problem, max_grid_size); + } + } + + // to reduce compile time, get rid of other architecture for now + // TODO: add all the kernel policy setting for all architectures + + printf("Not yet tuned for this architecture\n"); 
+ return cudaErrorInvalidDeviceFunction; + } + + /** @} */ +}; + +} // namespace vis +} // namespace app +} // namespace gunrock + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/gunrock/app/vis/vis_functor.cuh b/gunrock/app/vis/vis_functor.cuh new file mode 100644 index 000000000..7611d42d0 --- /dev/null +++ b/gunrock/app/vis/vis_functor.cuh @@ -0,0 +1,108 @@ +// ---------------------------------------------------------------------------- +// Gunrock -- High-Performance Graph Primitives on GPU +// ---------------------------------------------------------------------------- +// This source code is distributed under the terms of LICENSE.TXT +// in the root directory of this source distribution. +// ---------------------------------------------------------------------------- + +/** + * @file vis_functor.cuh + * @brief Device functions for Vertex-Induced Subgraph + */ + +#pragma once + +#include +#include + +namespace gunrock { +namespace app { +namespace vis { + +/** + * @brief Structure contains device functions + * + * @tparam VertexId Type used for vertex id (e.g., uint32) + * @tparam SizeT Type used for array indexing. (e.g., uint32) + * @tparam Value Type used for calculation values (e.g., float) + * @tparam ProblemData Problem data type which contains data slice + * + */ +template +struct VISFunctor { + typedef typename ProblemData::DataSlice DataSlice; + + /** + * @brief Advance condition function + * + * @param[in] s_id Vertex Id of the edge source node + * @param[in] d_id Vertex Id of the edge destination node + * @param[in] problem Data slice object + * @param[in] e_id Output edge id + * @param[in] e_id_in Input edge id + * + * \return Whether to load the apply function for the edge and + * include the destination node in the next frontier. 
+ */ + static __device__ __forceinline__ bool + CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, + VertexId e_id = 0, VertexId e_id_in = 0) { + return problem->d_bitmask[d_id]; + } + + /** + * @brief Advance apply function + * + * @param[in] s_id Vertex Id of the edge source node + * @param[in] d_id Vertex Id of the edge destination node + * @param[in] problem Data slice object + * @param[in] e_id Output edge id + * @param[in] e_id_in Input edge id + * + */ + static __device__ __forceinline__ void + ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem, + VertexId e_id = 0, VertexId e_id_in = 0) { + printf("select edges: sid: %d, did: %d, eid: %d\n", s_id, d_id, e_id); + } + + /** + * @brief filter condition function + * + * @param[in] node Vertex Id + * @param[in] problem Data slice object + * @param[in] v Auxiliary value + * + * \return Whether to load the apply function for the node and + * include it in the outgoing vertex frontier. + */ + static __device__ __forceinline__ bool + CondFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) { + return (node % 2) == 0; // TODO: USER-DEFINED FILTER CONDITION HERE + } + + /** + * @brief filter apply function + * + * @param[in] node Vertex Id + * @param[in] problem Data slice object + * @param[in] v Auxiliary value + * + */ + static __device__ __forceinline__ void + ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) { + util::io::ModifiedStore::St( + true, problem->d_bitmask + node); + } +}; + +} // namespace vis +} // namespace app +} // namespace gunrock + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/gunrock/app/vis/vis_problem.cuh b/gunrock/app/vis/vis_problem.cuh new file mode 100644 index 000000000..85519391b --- /dev/null +++ b/gunrock/app/vis/vis_problem.cuh @@ -0,0 +1,294 @@ +// ---------------------------------------------------------------------------- +// Gunrock -- 
High-Performance Graph Primitives on GPU +// ---------------------------------------------------------------------------- +// This source code is distributed under the terms of LICENSE.TXT +// in the root directory of this source distribution. +// ---------------------------------------------------------------------------- + +/** + * @file vis_problem.cuh + * @brief GPU storage management structure for Vertex-Induced Subgraph + */ + +#pragma once + +#include +#include + +namespace gunrock { +namespace app { +namespace vis { + +/** + * @brief Problem structure that stores device-side vectors + * @tparam _VertexId Type used as vertex id (e.g., uint32) + * @tparam _SizeT Type used for array indexing. (e.g., uint32) + * @tparam _Value Type used for computed values. + */ +template +struct VISProblem : ProblemBase<_VertexId, _SizeT, false> { + typedef _VertexId VertexId; + typedef _SizeT SizeT; + typedef _Value Value; + + static const bool MARK_PREDECESSORS = true; + static const bool ENABLE_IDEMPOTENCE = false; + + /** + * @brief Data slice structure which contains problem specific data. + */ + struct DataSlice { + // device storage arrays + VertexId *d_labels; // used for ... + bool *d_bitmask; // used for indicating if vertex is in subgraph + }; + + int num_gpus; + SizeT nodes; + SizeT edges; + + // data slices (one for each GPU) + DataSlice **data_slices; + + // putting structure on device while keeping the SoA structure + DataSlice **d_data_slices; + + // device index for each data slice + int *gpu_idx; + + /** + * @brief Default constructor + */ + VISProblem(): nodes(0), edges(0), num_gpus(0) {} + + /** + * @brief Constructor + * @param[in] stream_from_host Whether to stream data from host. + * @param[in] graph Reference to the CSR graph object we process on. + * @param[in] num_gpus Number of the GPUs used. 
+ */ + VISProblem(bool stream_from_host, // only meaningful for single-GPU + const Csr &graph, + int num_gpus) : + num_gpus(num_gpus) { + Init(stream_from_host, graph, num_gpus); + } + + /** + * @brief Default destructor + */ + ~VISProblem() { + for (int i = 0; i < num_gpus; ++i) { + if (util::GRError( + cudaSetDevice(gpu_idx[i]), + "~Problem cudaSetDevice failed", __FILE__, __LINE__)) break; + + if (data_slices[i]->d_labels) + util::GRError(cudaFree(data_slices[i]->d_labels), + "GpuSlice cudaFree d_labels failed", __FILE__, __LINE__); + + if (data_slices[i]->d_bitmask) + util::GRError(cudaFree(data_slices[i]->d_bitmask), + "DataSlice cudaFree d_bitmask failed", __FILE__, __LINE__); + + if (d_data_slices[i]) + util::GRError(cudaFree(d_data_slices[i]), + "GpuSlice cudaFree data_slices failed", __FILE__, __LINE__); + } + if (d_data_slices) delete[] d_data_slices; + if (data_slices) delete[] data_slices; + } + + /** + * \addtogroup PublicInterface + * @{ + */ + + /** + * @brief Copy results computed on the GPU back to host-side vectors. + * @param[out] h_labels + *\return cudaError_t object indicates the success of all CUDA functions. + */ + cudaError_t Extract(VertexId *h_labels) { + cudaError_t retval = cudaSuccess; + + do { + if (num_gpus == 1) { + if (util::GRError(cudaSetDevice(gpu_idx[0]), + "Problem cudaSetDevice failed", + __FILE__, __LINE__)) break; + + if (retval = util::GRError( + cudaMemcpy(h_labels, + data_slices[0]->d_labels, + sizeof(VertexId) * nodes, + cudaMemcpyDeviceToHost), + "Problem cudaMemcpy d_labels failed", + __FILE__, __LINE__)) break; + + // TODO: code to extract other results here + + } else { + // multi-GPU extension code + } + } while (0); + + return retval; + } + + /** + * @brief Problem initialization + * + * @param[in] stream_from_host Whether to stream data from host. + * @param[in] graph Reference to the CSR graph object we process on. + * @param[in] _num_gpus Number of the GPUs used. 
+ * + * \return cudaError_t object indicates the success of all CUDA functions. + */ + cudaError_t Init( + bool stream_from_host, // only meaningful for single-GPU + const Csr &graph, + int _num_gpus) { + num_gpus = _num_gpus; + nodes = graph.nodes; + edges = graph.edges; + VertexId *h_row_offsets = graph.row_offsets; + VertexId *h_column_indices = graph.column_indices; + + ProblemBase<_VertexId, _SizeT, false>::Init( + stream_from_host, + nodes, + edges, + h_row_offsets, + h_column_indices, + NULL, + NULL, + num_gpus); + + // no data in DataSlice needs to be copied from host + + /** + * Allocate output labels + */ + cudaError_t retval = cudaSuccess; + data_slices = new DataSlice * [num_gpus]; + d_data_slices = new DataSlice * [num_gpus]; + + do { + if (num_gpus <= 1) { + gpu_idx = (int*)malloc(sizeof(int)); + + // create a single data slice for the currently-set GPU + int gpu; + if (retval = util::GRError( + cudaGetDevice(&gpu), "Problem cudaGetDevice failed", + __FILE__, __LINE__)) break; + gpu_idx[0] = gpu; + + data_slices[0] = new DataSlice; + if (retval = util::GRError( + cudaMalloc((void**)&d_data_slices[0], sizeof(DataSlice)), + "Problem cudaMalloc d_data_slices failed", + __FILE__, __LINE__)) return retval; + + // create SoA on device + VertexId *d_labels; + if (retval = util::GRError( + cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)), + "Problem cudaMalloc d_labels failed", + __FILE__, __LINE__)) return retval; + data_slices[0]->d_labels = d_labels; + + bool *d_bitmask; + if (retval = util::GRError( + cudaMalloc((void**)&d_bitmask, nodes * sizeof(bool)), + "Problem cudaMalloc d_bitmask failed", + __FILE__, __LINE__)) return retval; + data_slices[0]->d_bitmask = d_bitmask; + util::MemsetKernel<<<128, 128>>>( + data_slices[0]->d_bitmask, (bool)false, nodes); + } + // add multi-GPU allocation code + } while (0); + + return retval; + } + + /** + * @brief Performs any initialization work needed for primitive + * @param[in] frontier_type Frontier type 
(i.e., edge / vertex / mixed) + * @param[in] queue_sizing Size scaling factor for work queue allocation + * \return cudaError_t object indicates the success of all CUDA functions. + */ + cudaError_t Reset( + FrontierType frontier_type, // type (i.e., edge / vertex / mixed) + double queue_sizing) { + // size scaling factor for work queue allocation (e.g., 1.0 creates + // n-element and m-element vertex and edge frontiers, respectively). + // 0.0 is unspecified. + + typedef ProblemBase<_VertexId, _SizeT, false> BaseProblem; + + // load ProblemBase Reset + BaseProblem::Reset(frontier_type, queue_sizing); + + cudaError_t retval = cudaSuccess; + + for (int gpu = 0; gpu < num_gpus; ++gpu) { + // setting device + if (retval = util::GRError( + cudaSetDevice(gpu_idx[gpu]), + "Problem cudaSetDevice failed", + __FILE__, __LINE__)) return retval; + + // allocate output labels if necessary + if (!data_slices[gpu]->d_labels) { + VertexId *d_labels; + if (retval = util::GRError( + cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)), + "Problem cudaMalloc d_labels failed", + __FILE__, __LINE__)) return retval; + data_slices[gpu]->d_labels = d_labels; + } + + util::MemsetKernel<<< 128, 128>>>( + data_slices[gpu]->d_labels, -1, nodes); + + if (!data_slices[gpu]->d_bitmask) { + bool *d_bitmask; + if (retval = util::GRError(cudaMalloc( + (void**)&d_bitmask, nodes * sizeof(bool)), + "MSTProblem cudaMalloc d_temp_value Failed", + __FILE__, __LINE__)) return retval; + data_slices[gpu]->d_bitmask = d_bitmask; + } + + if (retval = util::GRError( + cudaMemcpy(d_data_slices[gpu], + data_slices[gpu], + sizeof(DataSlice), + cudaMemcpyHostToDevice), + "Problem cudaMemcpy data_slices to d_data_slices failed", + __FILE__, __LINE__)) return retval; + } + + // TODO: fill in the initial input_queue for problem + // e.g., put every vertex in frontier queue + util::MemsetIdxKernel<<<128, 128>>>( + BaseProblem::graph_slices[0]->frontier_queues.d_keys[0], nodes); + + return retval; + } + + /** @} */ 
+}; + +} // namespace vis +} // namespace app +} // namespace gunrock + +// Leave this at the end of the file +// Local Variables: +// mode:c++ +// c-file-style: "NVIDIA" +// End: diff --git a/gunrock/coo.cuh b/gunrock/coo.cuh index e6b585a4c..008dec22b 100644 --- a/gunrock/coo.cuh +++ b/gunrock/coo.cuh @@ -37,8 +37,7 @@ struct Coo { Coo(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {} - void Val(Value &value) - { + void Val(Value &value) { value = val; } }; @@ -71,8 +70,7 @@ struct Coo { template bool RowFirstTupleCompare ( Coo elem1, - Coo elem2) -{ + Coo elem2) { if (elem1.row < elem2.row) { // Sort edges by source node return true; @@ -97,8 +95,7 @@ bool RowFirstTupleCompare ( template bool ColumnFirstTupleCompare ( Coo elem1, - Coo elem2) -{ + Coo elem2) { if (elem1.col < elem2.col) { // Sort edges by source node return true; diff --git a/gunrock/csr.cuh b/gunrock/csr.cuh index e73b2d5cc..3c56f2f1d 100644 --- a/gunrock/csr.cuh +++ b/gunrock/csr.cuh @@ -37,8 +37,7 @@ namespace gunrock { * the graph as a sparse matrix. */ template -struct Csr -{ +struct Csr { SizeT nodes; /**< Number of nodes in the graph. */ SizeT edges; /**< Number of edges in the graph. */ SizeT out_nodes; /**< Number of nodes which have outgoing edges. 
*/ @@ -60,8 +59,7 @@ struct Csr * @param[in] pinned Use pinned memory for CSR data structure * (default: do not use pinned memory) */ - Csr(bool pinned = false) - { + Csr(bool pinned = false) { nodes = 0; edges = 0; average_degree = 0; @@ -82,8 +80,7 @@ struct Csr * @param[in] edges Number of edges in COO-format graph */ template - void FromScratch(SizeT nodes, SizeT edges) - { + void FromScratch(SizeT nodes, SizeT edges) { this->nodes = nodes; this->edges = edges; @@ -92,32 +89,32 @@ struct Csr // Put our graph in pinned memory int flags = cudaHostAllocMapped; if (gunrock::util::GRError( - cudaHostAlloc((void **)&row_offsets, - sizeof(SizeT) * (nodes + 1), flags), - "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__)) + cudaHostAlloc((void **)&row_offsets, + sizeof(SizeT) * (nodes + 1), flags), + "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__)) exit(1); if (gunrock::util::GRError( - cudaHostAlloc((void **)&column_indices, - sizeof(VertexId) * edges, flags), - "Csr cudaHostAlloc column_indices failed", - __FILE__, __LINE__)) + cudaHostAlloc((void **)&column_indices, + sizeof(VertexId) * edges, flags), + "Csr cudaHostAlloc column_indices failed", + __FILE__, __LINE__)) exit(1); if (LOAD_NODE_VALUES) { if (gunrock::util::GRError( - cudaHostAlloc((void **)&node_values, - sizeof(Value) * nodes, flags), - "Csr cudaHostAlloc node_values failed", - __FILE__, __LINE__)) + cudaHostAlloc((void **)&node_values, + sizeof(Value) * nodes, flags), + "Csr cudaHostAlloc node_values failed", + __FILE__, __LINE__)) exit(1); } if (LOAD_EDGE_VALUES) { if (gunrock::util::GRError( - cudaHostAlloc((void **)&edge_values, - sizeof(Value) * edges, flags), - "Csr cudaHostAlloc edge_values failed", - __FILE__, __LINE__)) + cudaHostAlloc((void **)&edge_values, + sizeof(Value) * edges, flags), + "Csr cudaHostAlloc edge_values failed", + __FILE__, __LINE__)) exit(1); } @@ -127,51 +124,52 @@ struct Csr row_offsets = (SizeT*) malloc(sizeof(SizeT) * (nodes + 1)); column_indices = 
(VertexId*) malloc(sizeof(VertexId) * edges); node_values = (LOAD_NODE_VALUES) ? - (Value*) malloc(sizeof(Value) * nodes) : NULL; + (Value*) malloc(sizeof(Value) * nodes) : NULL; edge_values = (LOAD_EDGE_VALUES) ? - (Value*) malloc(sizeof(Value) * edges) : NULL; + (Value*) malloc(sizeof(Value) * edges) : NULL; } } /** * - * @brief Store graph information into files + * @brief Store graph information into a file * */ - void WriteToFile( - char *file_name, - bool undirected, - bool reversed, - SizeT num_nodes, - SizeT num_edges, - SizeT *row_offsets, - VertexId *col_indices, - Value *edge_values = NULL) - { - printf("==> Writing into file: %s\n", file_name); - time_t mark1 = time(NULL); - - std::ofstream output(file_name); - if (output.is_open()) - { - output << num_nodes << " " << num_edges << " "; - std::copy(row_offsets, row_offsets + num_nodes + 1, - std::ostream_iterator(output, " ")); - std::copy(column_indices, column_indices + num_edges, - std::ostream_iterator(output, " ")); - if (edge_values != NULL) - { - std::copy(edge_values, edge_values + num_edges, - std::ostream_iterator(output, " ")); + void WriteToFile(char *file_name, SizeT v, SizeT e, SizeT *row, + VertexId *col, Value *edge_values = NULL) { + std::ofstream fout(file_name); + if (fout.is_open()) { + fout.write(reinterpret_cast(&v), sizeof(SizeT)); + fout.write(reinterpret_cast(&e), sizeof(SizeT)); + fout.write(reinterpret_cast(row), (v+1)*sizeof(SizeT)); + fout.write(reinterpret_cast(col), e*sizeof(VertexId)); + if (edge_values != NULL) { + fout.write(reinterpret_cast(edge_values), + e * sizeof(Value)); } - output.close(); - } else - { - std::cout << "Cannot Open The File." 
<< std::endl; + fout.close(); } + } - time_t mark2 = time(NULL); - printf("Finished writing in %ds.\n", (int)(mark2 - mark1)); + void WriteToLigraFile(char *file_name, SizeT v, SizeT e, SizeT *row, + VertexId *col, Value *edge_values = NULL) { + char adj_name[256]; + sprintf(adj_name, "%s.adj", file_name); + printf("writing to ligra .adj file.\n"); + + std::ofstream fout3(adj_name); + if (fout3.is_open()) { + fout3 << v << " " << v << " " << e << std::endl; + for (int i = 0; i < v; ++i) + fout3 << row[i] << std::endl; + for (int i = 0; i < e; ++i) + fout3 << col[i] << std::endl; + if (edge_values != NULL) { + for (int i = 0; i < e; ++i) + fout3 << edge_values[i] << std::endl; + } + fout3.close(); + } } /** @@ -180,63 +178,36 @@ struct Csr * */ template - void FromCsr(char *f_in, bool undirected, bool reversed) - { - printf(" Reading directly from previously stored CSR arrays ...\n"); - - std::ifstream _file; - char buf[65536]; - _file.rdbuf()->pubsetbuf(buf,65536); - _file.open(f_in); - - if (_file.is_open()) - { - time_t mark1 = time(NULL); - - std::istream_iterator start(_file), end; - std::vector v(start, end); - - SizeT csr_nodes = v.at(0); - SizeT csr_edges = v.at(1); - printf("#nodes = %lld, #edges = %lld, #v = %lld\n", (long long)csr_nodes, (long long)csr_edges, (long long)v.size()); - - FromScratch(csr_nodes, csr_edges); + void FromCsr(char *f_in) { + printf(" Reading directly from stored binary CSR arrays ...\n"); + time_t mark1 = time(NULL); - std::copy(v.begin() + 2, v.begin() + 3 + csr_nodes, row_offsets); - std::copy(v.begin() + 3 + csr_nodes, - v.begin() + 3 + csr_nodes + csr_edges, - column_indices); - if(LOAD_EDGE_VALUES) - { - std::copy(v.begin() + 3 + csr_nodes + csr_edges, - v.end(), edge_values); - } + std::ifstream input(f_in); + SizeT v, e; + input.read(reinterpret_cast(&v), sizeof(SizeT)); + input.read(reinterpret_cast(&e), sizeof(SizeT)); - time_t mark2 = time(NULL); - printf("Done reading (%ds).\n", (int) (mark2 - mark1)); + FromScratch(v, 
e); - v.clear(); - } - else - { - perror("Unable To Open The File."); + input.read(reinterpret_cast(row_offsets), (v + 1)*sizeof(SizeT)); + input.read(reinterpret_cast(column_indices), e*sizeof(VertexId)); + if (LOAD_EDGE_VALUES) { + input.read(reinterpret_cast(edge_values), e*sizeof(Value)); } + time_t mark2 = time(NULL); + printf("Done reading (%ds).\n", (int) (mark2 - mark1)); + // compute out_nodes SizeT out_node = 0; - for (SizeT node = 0; node < nodes; node++) - { - if (row_offsets[node+1] - row_offsets[node] > 0) - { + for (SizeT node = 0; node < nodes; node++) { + if (row_offsets[node + 1] - row_offsets[node] > 0) { ++out_node; } } out_nodes = out_node; - - fflush(stdout); } - /** * @brief Build CSR graph from COO graph, sorted or unsorted * @@ -366,33 +337,20 @@ struct Csr printf("Done converting (%ds).\n", (int)(mark2 - mark1)); // Write offsets, indices, node, edges etc. into file - if (LOAD_EDGE_VALUES) - { - WriteToFile(output_file, - undirected, - reversed, - nodes, - edges, - row_offsets, - column_indices, - edge_values); + if (LOAD_EDGE_VALUES) { + WriteToFile(output_file, nodes, edges, + row_offsets, column_indices, edge_values); + //WriteToLigraFile(output_file, nodes, edges, + // row_offsets, column_indices, edge_values); } else { - WriteToFile(output_file, - undirected, - reversed, - nodes, - edges, - row_offsets, - column_indices); + WriteToFile(output_file, nodes, edges, + row_offsets, column_indices); } - fflush(stdout); - // Compute out_nodes SizeT out_node = 0; for (SizeT node = 0; node < nodes; node++) { - if (row_offsets[node+1] - row_offsets[node] > 0) - { + if (row_offsets[node + 1] - row_offsets[node] > 0) { ++out_node; } } @@ -407,8 +365,7 @@ struct Csr /** * @brief Print log-scale degree histogram of the graph. 
*/ - void PrintHistogram() - { + void PrintHistogram() { fflush(stdout); // Initialize @@ -436,7 +393,6 @@ struct Csr } printf("\nDegree Histogram (%lld vertices, %lld edges):\n", (long long) nodes, (long long) edges); - printf(" Degree 0: %d (%.2f%%)\n", log_counts[0], (float) log_counts[0] * 100.0 / nodes); for (int i = 0; i < max_log_length + 1; i++) { @@ -451,9 +407,8 @@ struct Csr /** * @brief Display CSR graph to console */ - void DisplayGraph(bool with_edge_value = false) - { - SizeT displayed_node_num = (nodes > 40) ? 40:nodes; + void DisplayGraph(bool with_edge_value = false) { + SizeT displayed_node_num = (nodes > 40) ? 40 : nodes; printf("First %d nodes's neighbor list of the input graph:\n", displayed_node_num); for (SizeT node = 0; node < displayed_node_num; node++) { @@ -465,7 +420,7 @@ struct Csr if (edge - row_offsets[node] > 40) break; printf("["); util::PrintValue(column_indices[edge]); - if (with_edge_value) { + if (with_edge_value && edge_values != NULL) { printf(","); util::PrintValue(edge_values[edge]); } @@ -545,19 +500,19 @@ struct Csr { for (SizeT node = 0; node < nodes; ++node) { for (SizeT edge = row_offsets[node]; - edge < row_offsets[node+1]; - ++edge) { - int src_node = node; - int dst_node = column_indices[edge]; - int edge_value = edge_values[edge]; - for (SizeT r_edge = row_offsets[dst_node]; - r_edge < row_offsets[dst_node+1]; - ++r_edge) { + edge < row_offsets[node + 1]; + ++edge) { + int src_node = node; + int dst_node = column_indices[edge]; + int edge_value = edge_values[edge]; + for (SizeT r_edge = row_offsets[dst_node]; + r_edge < row_offsets[dst_node + 1]; + ++r_edge) { if (column_indices[r_edge] == src_node) { if (edge_values[r_edge] != edge_value) return false; } - } + } } } return true; @@ -566,14 +521,12 @@ struct Csr /** * @brief Find node with largest neighbor list */ - int GetNodeWithHighestDegree(int& max_degree) - { + int GetNodeWithHighestDegree(int& max_degree) { int degree = 0; int src = 0; for (SizeT node = 0; 
node < nodes; node++) { - if (row_offsets[node+1] - row_offsets[node] > degree) - { - degree = row_offsets[node+1]-row_offsets[node]; + if (row_offsets[node + 1] - row_offsets[node] > degree) { + degree = row_offsets[node + 1] - row_offsets[node]; src = node; } } @@ -584,16 +537,15 @@ struct Csr /** * @brief Display the neighbor list of a given node */ - void DisplayNeighborList(VertexId node) - { + void DisplayNeighborList(VertexId node) { if (node < 0 || node >= nodes) return; for (SizeT edge = row_offsets[node]; - edge < row_offsets[node + 1]; - edge++) { - util::PrintValue(column_indices[edge]); - printf(", "); - } - printf("\n"); + edge < row_offsets[node + 1]; + edge++) { + util::PrintValue(column_indices[edge]); + printf(", "); + } + printf("\n"); } /** @@ -604,7 +556,7 @@ struct Csr double mean = 0, count = 0; for (SizeT node = 0; node < nodes; ++node) { count += 1; - mean += (row_offsets[node+1]- row_offsets[node] - mean) / count; + mean += (row_offsets[node+1]-row_offsets[node]-mean)/count; } average_degree = static_cast(mean); } @@ -650,8 +602,7 @@ struct Csr /** * @brief Deallocates CSR graph */ - void Free() - { + void Free() { if (row_offsets) { if (pinned) { gunrock::util::GRError(cudaFreeHost(row_offsets), @@ -682,8 +633,7 @@ struct Csr /** * @brief CSR destructor */ - ~Csr() - { + ~Csr() { Free(); } }; diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh index 3795d3bca..12c9a3235 100644 --- a/gunrock/graphio/market.cuh +++ b/gunrock/graphio/market.cuh @@ -56,8 +56,7 @@ int ReadMarketStream( char *output_file, Csr &csr_graph, bool undirected, - bool reversed) -{ + bool reversed) { typedef Coo EdgeTupleType; SizeT edges_read = -1; @@ -73,7 +72,7 @@ int ReadMarketStream( bool ordered_rows = true; - while(true) { + while (true) { if (fscanf(f_in, "%[^\n]\n", line) <= 0) { break; @@ -110,7 +109,7 @@ int ReadMarketStream( fflush(stdout); // Allocate coo graph - coo = (EdgeTupleType*) malloc(sizeof(EdgeTupleType) * edges); + coo = 
(EdgeTupleType*)malloc(sizeof(EdgeTupleType) * edges); edges_read++; @@ -122,26 +121,27 @@ int ReadMarketStream( return -1; } if (edges_read >= edges) { - fprintf(stderr, - "Error parsing MARKET graph:" - "encountered more than %d edges\n", - edges); - if (coo) free(coo); - return -1; + fprintf(stderr, + "Error parsing MARKET graph:" + "encountered more than %d edges\n", + edges); + if (coo) free(coo); + return -1; } long long ll_row, ll_col, ll_value; + // Value ll_value; // used for parse float / double int num_input; if (LOAD_VALUES) { if ((num_input = sscanf( - line, "%lld %lld %lld", - &ll_col, &ll_row, &ll_value)) < 2) { + line, "%lld %lld %lld", + &ll_col, &ll_row, &ll_value)) < 2) { fprintf(stderr, "Error parsing MARKET graph: badly formed edge\n"); if (coo) free(coo); return -1; } else if (num_input == 2) { - ll_value = 1; + ll_value = rand() % 64; } } else { if (sscanf(line, "%lld %lld", &ll_col, &ll_row) != 2) { @@ -205,7 +205,6 @@ int ReadMarketStream( undirected, reversed); free(coo); - fflush(stdout); return 0; @@ -216,13 +215,9 @@ int ReadMarketStream( * */ template -int ReadCsrArrays( - char *f_in, - Csr &csr_graph, - bool undirected, - bool reversed) -{ - csr_graph.template FromCsr(f_in, undirected, reversed); +int ReadCsrArrays(char *f_in, Csr &csr_graph, + bool undirected, bool reversed) { + csr_graph.template FromCsr(f_in); return 0; } @@ -249,34 +244,30 @@ int BuildMarketGraph( char *output_file, Csr &csr_graph, bool undirected, - bool reversed) -{ + bool reversed) { FILE *_file = fopen(output_file, "r"); - if (_file) - { + if (_file) { fclose(_file); if (ReadCsrArrays( - output_file, csr_graph, undirected, reversed) != 0) { + output_file, csr_graph, undirected, reversed) != 0) { return -1; } - } - else { + } else { if (mm_filename == NULL) { // Read from stdin printf("Reading from stdin:\n"); if (ReadMarketStream( - stdin, output_file, csr_graph, undirected, reversed) != 0) { + stdin, output_file, csr_graph, undirected, reversed) != 0) { 
return -1; } - } - else { + } else { // Read from file FILE *f_in = fopen(mm_filename, "r"); if (f_in) { printf("Reading from %s:\n", mm_filename); if (ReadMarketStream( - f_in, output_file, csr_graph, - undirected, reversed) != 0) { + f_in, output_file, csr_graph, + undirected, reversed) != 0) { fclose(f_in); return -1; } @@ -299,37 +290,29 @@ int BuildMarketGraph( char *file_in, Csr &graph, bool undirected, - bool reversed) -{ + bool reversed) { // seperate the graph path and the file name char *temp1 = strdup(file_in); char *temp2 = strdup(file_in); char *file_path = dirname (temp1); char *file_name = basename(temp2); - if (undirected) - { - char ud[256]; - sprintf(ud, "%s/.%s_undirected_csr", file_path, file_name); - if (BuildMarketGraph(file_in, ud, graph, true, false) != 0) + if (undirected) { + char ud[256]; // undirected graph + sprintf(ud, "%s/.%s.ud.bin", file_path, file_name); + if (BuildMarketGraph(file_in, ud, graph, true, false) != 0) return 1; - } - else if (!undirected && reversed) - { - char rv[256]; - sprintf(rv, "%s/.%s_reversed_csr", file_path, file_name); - if (BuildMarketGraph(file_in, rv, graph, false, true) != 0) + } else if (!undirected && reversed) { + char rv[256]; // reversed graph + sprintf(rv, "%s/.%s.rv.bin", file_path, file_name); + if (BuildMarketGraph(file_in, rv, graph, false, true) != 0) return 1; - } - else if (!undirected && !reversed) - { - char nr[256]; - sprintf(nr, "%s/.%s_nonreversed_csr", file_path, file_name); - if (BuildMarketGraph(file_in, nr, graph, false, false) != 0) + } else if (!undirected && !reversed) { + char di[256]; // directed graph + sprintf(di, "%s/.%s.di.bin", file_path, file_name); + if (BuildMarketGraph(file_in, di, graph, false, false) != 0) return 1; - } - else - { + } else { fprintf(stderr, "Unspecified Graph Type.\n"); } return 0; diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h index ee695951f..24d8d421f 100644 --- a/gunrock/gunrock.h +++ b/gunrock/gunrock.h @@ -12,7 +12,6 @@ * The Gunrock public 
interface is a C-only interface to enable linking * with code written in other languages. While the internals of Gunrock * are not limited to C. - * */ #include @@ -21,129 +20,191 @@ /** * @brief VertexId data type enumerators. */ -enum VertexIdType { - VTXID_INT, //!< integer type +enum VtxIdType { + VTXID_INT, // integer type }; /** * @brief SizeT data type enumerators. */ enum SizeTType { - SIZET_INT, //!< unsigned integer type + SIZET_INT, // unsigned integer type }; /** * @brief Value data type enumerators. */ enum ValueType { - VALUE_INT, //!< integer type - VALUE_UINT, //!< unsigned int type - VALUE_FLOAT, //!< float type + VALUE_INT, // integer type + VALUE_UINT, // unsigned int type + VALUE_FLOAT, // float type }; /** * @brief data-type configuration used to specify data types */ -struct GunrockDataType { - enum VertexIdType VTXID_TYPE; //!< VertexId data-type - enum SizeTType SIZET_TYPE; //!< SizeT data-type - enum ValueType VALUE_TYPE; //!< Value data-type +struct GRTypes { + enum VtxIdType VTXID_TYPE; // VertexId data type + enum SizeTType SIZET_TYPE; // SizeT data type + enum ValueType VALUE_TYPE; // Value data type }; /** * @brief GunrockGraph as a standard graph interface */ -struct GunrockGraph { - size_t num_nodes; //!< number of nodes in graph - size_t num_edges; //!< number of edges in graph - void *row_offsets; //!< C.S.R. row offsets - void *col_indices; //!< C.S.R. column indices - void *col_offsets; //!< C.S.C. column offsets - void *row_indices; //!< C.S.C. 
row indices - void *node_values; //!< associated values per node - void *edge_values; //!< associated values per edge +struct GRGraph { + size_t num_nodes; // number of nodes in graph + size_t num_edges; // number of edges in graph + void *row_offsets; // CSR row offsets + void *col_indices; // CSR column indices + void *col_offsets; // CSC column offsets + void *row_indices; // CSC row indices + void *node_values; // associated values per node + void *edge_values; // associated values per edge }; /** * @brief Source Vertex Mode enumerators. */ enum SrcMode { - manually, //!< manually set up source node - randomize, //!< random generate source node - largest_degree, //!< set to largest-degree node + manually, // manually set up source node + randomize, // random generate source node + largest_degree, // set to largest-degree node }; /** * @brief arguments configuration used to specify arguments */ -struct GunrockConfig { - bool mark_pred; //!< whether to mark predecessor or not - bool idempotence; //!< whether or not to enable idempotent - int src_node; //!< source vertex define where to start - int device; //!< setting which gpu device to use - int max_iter; //!< maximum number of iterations allowed - int top_nodes; //!< k value for topk / page_rank problem - int delta_factor; //!< sssp delta-factor parameter - float delta; //!< pagerank specific value - float error; //!< pagerank specific value - float queue_size; //!< setting frontier queue size - enum SrcMode src_mode; //!< source mode rand/largest_degree +struct GRSetup { + bool mark_pred; // whether to mark predecessor or not + bool idempotence; // whether or not to enable idempotent + int src_node; // source vertex define where to start + int device; // setting which device to use + int max_iter; // maximum number of iterations allowed + int top_nodes; // k value for top k / pagerank problem + int delta_factor; // sssp delta-factor parameter + float delta; // pagerank specific value + float error; // 
pagerank specific value + float queue_size; // setting frontier queue size + enum SrcMode src_mode; // source mode rand/largest_degree }; #ifdef __cplusplus extern "C" { #endif -// BFS Function Define -void gunrock_bfs_func( - struct GunrockGraph *graph_out, - const struct GunrockGraph *graph_in, - struct GunrockConfig configs, - struct GunrockDataType data_type); - -// BC Function Define -void gunrock_bc_func( - struct GunrockGraph *graph_out, - const struct GunrockGraph *graph_in, - struct GunrockConfig configs, - struct GunrockDataType data_type); - -// CC Function Define -void gunrock_cc_func( - struct GunrockGraph *graph_out, - unsigned int *components, - const struct GunrockGraph *graph_in, - struct GunrockConfig configs, - struct GunrockDataType data_type); - -// SSSP Function Define -void gunrock_sssp_func( - struct GunrockGraph *graph_out, - void *predecessor, - const struct GunrockGraph *graph_in, - struct GunrockConfig congis, - struct GunrockDataType data_type); - -// PR Function Define -void gunrock_pr_func( - struct GunrockGraph *graph_out, - void *node_ids, - void *page_rank, - const struct GunrockGraph *graph_in, - struct GunrockConfig configs, - struct GunrockDataType data_type); - -// TopK Function Define -void gunrock_topk_func( - struct GunrockGraph *graph_out, - void *node_ids, - void *in_degrees, - void *out_degrees, - const struct GunrockGraph *graph_in, - struct GunrockConfig configs, - struct GunrockDataType data_type); - -// TODO: Add other algorithms +/** + * breath-first search + */ +void gunrock_bfs( + struct GRGraph* graph_o, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +void bfs( + int* bfs_label, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const int source); + +/** + * betweenness centrality + */ +void gunrock_bc( + struct GRGraph* graph_o, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes 
data_t); + +void bc( + float* bc_scores, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const int source); + +/** + * connected component + */ +void gunrock_cc( + struct GRGraph* graph_o, + unsigned int* components, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +int cc( + int* component, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices); + +/** + * single-source shortest path + */ +void gunrock_sssp( + struct GRGraph* graph_o, + void* predecessor, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +void sssp( + unsigned int* distances, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices, + const unsigned int* edge_values, + const int source); + +// pagerank +void gunrock_pagerank( + struct GRGraph* graph_o, + void* node_ids, + void* pagerank, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +void pagerank( + int* node_ids, + float* pagerank, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices); + +// degree centrality +void gunrock_topk( + struct GRGraph* graph_o, + void* node_ids, + void* in_degrees, + void* out_degrees, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +// minimum spanning tree +void gunrock_mst( + struct GRGraph* graph_o, + const struct GRGraph* graph_i, + const struct GRSetup config, + const struct GRTypes data_t); + +void mst( + bool* edge_mask, + const int num_nodes, + const int num_edges, + const int* row_offsets, + const int* col_indices); + +// TODO(ydwu): Add other primitives #ifdef __cplusplus } diff --git a/gunrock/oprtr/edge_map_partitioned/kernel.cuh b/gunrock/oprtr/edge_map_partitioned/kernel.cuh index b9634ba8b..ba5295a81 100644 --- 
a/gunrock/oprtr/edge_map_partitioned/kernel.cuh +++ b/gunrock/oprtr/edge_map_partitioned/kernel.cuh @@ -564,7 +564,7 @@ struct Dispatch } // Determine work decomposition - if (blockIdx.x == 0 && threadIdx.x == 0) { + if (blockIdx.x == 0 && threadIdx.x == 0) { // obtain problem size if (queue_reset) @@ -586,10 +586,10 @@ struct Dispatch // Reset our next outgoing queue counter to zero work_progress.template StoreQueueLength(0, queue_index + 2); work_progress.template PrepResetSteal(queue_index + 1); - } + } // Barrier to protect work decomposition - __syncthreads(); + __syncthreads(); unsigned int range = input_queue_len; int tid = threadIdx.x; @@ -618,16 +618,16 @@ struct Dispatch else s_vertices[tid] = (my_id < range ? d_column_indices[d_queue[my_id]] : max_vertices); s_edge_ids[tid] = (my_id < range ? d_queue[my_id] : max_vertices); - } + } __syncthreads(); - unsigned int size = s_edges[end_id]; + unsigned int size = s_edges[end_id]; VertexId v, e, e_id; int v_index = BinarySearch(tid, s_edges); v = s_vertices[v_index]; e_id = s_edge_ids[v_index]; - int end_last = (v_index < KernelPolicy::THREADS ? s_edges[v_index] : max_vertices); + int end_last = (v_index < KernelPolicy::THREADS ? 
s_edges[v_index] : max_vertices); for (int i = tid; i < size; i += KernelPolicy::THREADS) { @@ -726,7 +726,7 @@ struct Dispatch } } } - } else { + } else { //v:pre, u:neighbor, outoffset:offset+i if (Functor::CondEdge(v, u, problem, lookup, e_id)) { Functor::ApplyEdge(v, u, problem, lookup, e_id); diff --git a/gunrock/util/select_utils.cuh b/gunrock/util/select_utils.cuh index 2db66ca14..1da71e8fa 100644 --- a/gunrock/util/select_utils.cuh +++ b/gunrock/util/select_utils.cuh @@ -18,136 +18,104 @@ namespace gunrock { namespace util { - /** - * \addtogroup PublicInterface - * @{ - */ - - //--------------------------------------------------------------------- - // Globals, constants and typedefs - //--------------------------------------------------------------------- - struct GreaterThan - { - int compare; - - __host__ __device__ __forceinline__ - GreaterThan(int compare) : compare(compare) { } - - __host__ __device__ __forceinline__ - bool operator()(const int &a) const { return (a > compare); } - }; - - /** - * @brief selects items from from a sequence of int keys using a - * section functor (greater-than) - * - */ - template - cudaError_t CUBSelect( - VertexId *d_input, - SizeT num_elements, - VertexId *d_output, - unsigned int *num_selected) - { - cudaError_t retval = cudaSuccess; - - /* - VertexId *input = NULL; - VertexId *output = NULL; - - if (util::GRError((retval = cudaMalloc( - &input, sizeof(VertexId)*d_num_elements)), - "CUBSelect input malloc failed", - __FILE__, __LINE__)) return retval; - if (util::GRError((retval = cudaMalloc( - &output, sizeof(VertexId)*d_num_elements)), - "CUBSelect output malloc failed", - __FILE__, __LINE__)) return retval; - - cub::DoubleBuffer d_input_buffer(d_input, input); - cub::DoubleBuffer d_output_buffer(d_output, output); - */ - - unsigned int *d_num_selected = NULL; - if (util::GRError((retval = cudaMalloc( - (void**)&d_num_selected, sizeof(unsigned int))), - "CUBSelect d_num_selected malloc failed", - __FILE__, 
__LINE__)) return retval; - - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - GreaterThan select_op(-1); - - // determine temporary device storage requirements - if (util::GRError((retval = cub::DeviceSelect::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected, - num_elements, - select_op)), - "CUBSelect cub::DeviceSelect::If failed", - __FILE__, __LINE__)) return retval; - - // allocate temporary storage - if (util::GRError((retval = cudaMalloc( - &d_temp_storage, temp_storage_bytes)), - "CUBSelect malloc d_temp_storage failed", - __FILE__, __LINE__)) return retval; - - // run selection - if (util::GRError((retval = cub::DeviceSelect::If( - d_temp_storage, - temp_storage_bytes, - d_input, - d_output, - d_num_selected, - num_elements, - select_op)), +/** + * \addtogroup PublicInterface + * @{ + */ + +//--------------------------------------------------------------------- +// Globals, constants and typedefs +//--------------------------------------------------------------------- +struct GreaterThan +{ + int compare; + + __host__ __device__ __forceinline__ + GreaterThan(int compare) : compare(compare) { } + + __host__ __device__ __forceinline__ + bool operator()(const int &a) const { return (a > compare); } +}; + +/** + * @brief selects items from from a sequence of int keys using a + * section functor (greater-than) + * + */ +template +cudaError_t CUBSelect( + T *d_input, + SizeT num_elements, + T *d_output, + unsigned int *num_selected) +{ + cudaError_t retval = cudaSuccess; + unsigned int *d_num_selected = NULL; + + if (util::GRError( + (retval = cudaMalloc((void**)&d_num_selected, sizeof(unsigned int))), + "CUBSelect d_num_selected malloc failed", + __FILE__, __LINE__)) return retval; + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + GreaterThan select_op(-1); + + // determine temporary device storage requirements + if (util::GRError( + (retval = cub::DeviceSelect::If( + d_temp_storage, + 
temp_storage_bytes, + d_input, + d_output, + d_num_selected, + num_elements, + select_op)), "CUBSelect cub::DeviceSelect::If failed", __FILE__, __LINE__)) return retval; - /* - // copy back output - if (util::GRError((retval = cudaMemcpy( - d_output, - d_output_buffer.Current(), - sizeof(VertexId)*(*d_num_selected), - cudaMemcpyDeviceToDevice)), - "CUBSelect copy back output failed", - __FILE__, __LINE__)) return retval; - */ - - if (util::GRError((retval = cudaMemcpy( - num_selected, - d_num_selected, - sizeof(unsigned int), - cudaMemcpyDeviceToHost)), - "CUBSelect copy back num_selected failed", - __FILE__, __LINE__)) return retval; - - // clean up - if (util::GRError((retval = cudaFree(d_temp_storage)), - "CUBSelect free d_temp_storage failed", - __FILE__, __LINE__)) return retval; - if (util::GRError((retval = cudaFree(d_num_selected)), - "CUBSelect free d_num_selected failed", - __FILE__, __LINE__)) return retval; + // allocate temporary storage + if (util::GRError( + (retval = cudaMalloc(&d_temp_storage, temp_storage_bytes)), + "CUBSelect malloc d_temp_storage failed", + __FILE__, __LINE__)) return retval; + + // run selection + if (util::GRError( + (retval = cub::DeviceSelect::If( + d_temp_storage, + temp_storage_bytes, + d_input, + d_output, + d_num_selected, + num_elements, + select_op)), + "CUBSelect cub::DeviceSelect::If failed", + __FILE__, __LINE__)) return retval; - /* - if (util::GRError((retval = cudaFree(input)), - "CUBSelect free input failed", - __FILE__, __LINE__)) return retval; - if (util::GRError((retval = cudaFree(output)), - "CUBSelect free output failed", - __FILE__, __LINE__)) return retval; - */ + if (util::GRError( + (retval = cudaMemcpy( + num_selected, + d_num_selected, + sizeof(unsigned int), + cudaMemcpyDeviceToHost)), + "CUBSelect copy back num_selected failed", + __FILE__, __LINE__)) return retval; + + // clean up + if (util::GRError( + (retval = cudaFree(d_temp_storage)), + "CUBSelect free d_temp_storage failed", + __FILE__, 
__LINE__)) return retval; + if (util::GRError( + (retval = cudaFree(d_num_selected)), + "CUBSelect free d_num_selected failed", + __FILE__, __LINE__)) return retval; - return retval; - } + return retval; +} - /** @} */ +/** @} */ } //util } //gunrock diff --git a/gunrock/util/test_utils.cuh b/gunrock/util/test_utils.cuh index 8e62f63cd..491b2136e 100644 --- a/gunrock/util/test_utils.cuh +++ b/gunrock/util/test_utils.cuh @@ -251,6 +251,44 @@ void DisplayDeviceResults( if (h_data) free(h_data); } +/** + * Verify the contents of a device array match those + * of a host array + */ +template +void DisplayDeviceResults( + DATATYPE *d_data, + INDEXTYPE *d_indices, + size_t num_elements, + size_t num_indices) +{ + printf("num_elements:%d\n", num_elements); + printf("num_indices:%d\n", num_indices); + // Allocate array on host + DATATYPE *h_data = (DATATYPE*) malloc(num_elements * sizeof(DATATYPE)); + INDEXTYPE *h_indices = (INDEXTYPE*) malloc(num_indices * sizeof(INDEXTYPE)); + + // Reduction data back + cudaMemcpy(h_data, d_data, sizeof(DATATYPE) * num_elements, cudaMemcpyDeviceToHost); + cudaMemcpy(h_indices, d_indices, sizeof(INDEXTYPE) * num_indices, cudaMemcpyDeviceToHost); + + // Display data + printf("\n\nData:\n"); + for (int i = 0; i < num_indices; i++) + { + PrintValue(h_indices[i]); + printf(":"); + assert(h_indices[i] < num_elements); + PrintValue(h_data[h_indices[i]]); + printf(", "); + } + printf("\n\n"); + + // Cleanup + if (h_data) free(h_data); + if (h_indices) free(h_indices); +} + /****************************************************************************** * Timing ******************************************************************************/ @@ -423,7 +461,8 @@ int CompareResults( is_right = false; } } - if (!is_right && flag == 0) + + if (!is_right) { printf("\nINCORRECT: [%lu]: ", (unsigned long) i); PrintValue(computed[i]); @@ -448,7 +487,6 @@ int CompareResults( printf("...]"); } flag += 1; - //return flag; } if (!is_right && flag > 0) flag += 
1; } diff --git a/gunrock/util/test_utils.h b/gunrock/util/test_utils.h index 41c27c04b..f3625bdfe 100644 --- a/gunrock/util/test_utils.h +++ b/gunrock/util/test_utils.h @@ -19,16 +19,17 @@ #undef small // Windows is terrible for polluting macro namespace #else #include - #include #endif #include #include #include +#include #include #include #include +#include #include #include #include @@ -217,59 +218,8 @@ struct CpuTimer return (stop - start) * 1000; } -/*#elif defined(CLOCK_PROCESS_CPUTIME_ID) - - timespec start; - timespec stop; - - void Start() - { - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); - } - - void Stop() - { - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop); - } - - float ElapsedMillis() - { - timespec temp; - if ((stop.tv_nsec-start.tv_nsec)<0) { - temp.tv_sec = stop.tv_sec-start.tv_sec-1; - temp.tv_nsec = 1000000000+stop.tv_nsec-start.tv_nsec; - } else { - temp.tv_sec = stop.tv_sec-start.tv_sec; - temp.tv_nsec = stop.tv_nsec-start.tv_nsec; - } - return temp.tv_nsec/1000000.0; - }*/ - #else - /* - rusage start; - rusage stop; - - void Start() - { - getrusage(RUSAGE_SELF, &start); - } - - void Stop() - { - getrusage(RUSAGE_SELF, &stop); - } - - float ElapsedMillis() - { - float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec; - float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec; - - return (sec * 1000) + (usec / 1000); - } - */ - boost::timer::cpu_timer::cpu_timer cpu_t; void Start() diff --git a/python/betweenness_centrality.py b/python/betweenness_centrality.py new file mode 100644 index 000000000..e7978d04f --- /dev/null +++ b/python/betweenness_centrality.py @@ -0,0 +1,26 @@ +### sample python interface - betweenness centrality + +from ctypes import * + +### load gunrock shared library - libgunrock +gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so') + +### read in input CSR arrays from files +row_list = [int(x.strip()) for x in open('toy_graph/row.txt')] +col_list = [int(x.strip()) for x in open('toy_graph/col.txt')] + 
+### convert CSR graph inputs for gunrock input +row = pointer((c_int * len(row_list))(*row_list)) +col = pointer((c_int * len(col_list))(*col_list)) +nodes = len(row_list) - 1 +edges = len(col_list) + +### output array +scores = pointer((c_float * nodes)()) + +### call gunrock function on device +gunrock.bc(scores, nodes, edges, row, col, -1) + +### sample results +print ' node bc scores:', +for idx in range(nodes): print scores[0][idx], diff --git a/python/breath_first_search.py b/python/breath_first_search.py new file mode 100644 index 000000000..b67fe80c0 --- /dev/null +++ b/python/breath_first_search.py @@ -0,0 +1,26 @@ +### sample python interface - breath-first search + +from ctypes import * + +### load gunrock shared library - libgunrock +gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so') + +### read in input CSR arrays from files +row_list = [int(x.strip()) for x in open('toy_graph/row.txt')] +col_list = [int(x.strip()) for x in open('toy_graph/col.txt')] + +### convert CSR graph inputs for gunrock input +row = pointer((c_int * len(row_list))(*row_list)) +col = pointer((c_int * len(col_list))(*col_list)) +nodes = len(row_list) - 1 +edges = len(col_list) + +### output array +labels = pointer((c_int * nodes)()) + +### call gunrock function on device +gunrock.bfs(labels, nodes, edges, row, col, 0) + +### sample results +print ' bfs labels (depth):', +for idx in range(nodes): print labels[0][idx], diff --git a/python/connected_components.py b/python/connected_components.py new file mode 100644 index 000000000..89fd824cb --- /dev/null +++ b/python/connected_components.py @@ -0,0 +1,27 @@ +### sample python interface - connected components + +from ctypes import * + +### load gunrock shared library - libgunrock +gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so') + +### read in input CSR arrays from files +row_list = [int(x.strip()) for x in open('toy_graph/row.txt')] +col_list = [int(x.strip()) for x in open('toy_graph/col.txt')] + +### convert 
CSR graph inputs for gunrock input +row = pointer((c_int * len(row_list))(*row_list)) +col = pointer((c_int * len(col_list))(*col_list)) +nodes = len(row_list) - 1 +edges = len(col_list) + +### output array +labels = pointer((c_int * nodes)()) + +### call gunrock function on device +num_components = gunrock.cc(labels, nodes, edges, row, col) + +### sample results +print ' number of components: ' + str(num_components) +print ' component ids:', +for idx in range(nodes): print labels[0][idx], diff --git a/python/pagerank.py b/python/pagerank.py new file mode 100644 index 000000000..642fa2e12 --- /dev/null +++ b/python/pagerank.py @@ -0,0 +1,29 @@ +### sample python interface - pagerank + +from ctypes import * + +### load gunrock shared library - libgunrock +gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so') + +### read in input CSR arrays from files +row_list = [int(x.strip()) for x in open('toy_graph/row.txt')] +col_list = [int(x.strip()) for x in open('toy_graph/col.txt')] + +### convert CSR graph inputs for gunrock input +row = pointer((c_int * len(row_list))(*row_list)) +col = pointer((c_int * len(col_list))(*col_list)) +nodes = len(row_list) - 1 +edges = len(col_list) + +### output array +node = pointer((c_int * nodes)()) +rank = pointer((c_float * nodes)()) + +### call gunrock function on device +gunrock.pagerank(node, rank, nodes, edges, row, col) + +### sample results +print 'top page rank:' +for idx in range(nodes): + print node[0][idx], + print rank[0][idx] diff --git a/python/single_source_shortest_path.py b/python/single_source_shortest_path.py new file mode 100644 index 000000000..69edc2b39 --- /dev/null +++ b/python/single_source_shortest_path.py @@ -0,0 +1,28 @@ +### sample python interface - single-source shortest path + +from ctypes import * + +### load gunrock shared library - libgunrock +gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so') + +### read in input CSR arrays from files +row_list = [int(x.strip()) for x in 
open('toy_graph/row.txt')] +col_list = [int(x.strip()) for x in open('toy_graph/col.txt')] +val_list = [int(x.strip()) for x in open('toy_graph/val.txt')] + +### convert CSR graph inputs for gunrock input +row = pointer((c_int * len(row_list))(*row_list)) +col = pointer((c_int * len(col_list))(*col_list)) +val = pointer((c_uint * len(val_list))(*val_list)) +nodes = len(row_list) - 1 +edges = len(col_list) + +### output array +labels = pointer((c_uint * nodes)()) + +### call gunrock function on device +gunrock.sssp(labels, nodes, edges, row, col, val, 0) + +### sample results +print ' sssp labels (distance):', +for idx in range(nodes): print labels[0][idx], diff --git a/python/toy_graph/col.txt b/python/toy_graph/col.txt new file mode 100644 index 000000000..12c10b45e --- /dev/null +++ b/python/toy_graph/col.txt @@ -0,0 +1,26 @@ +1 +2 +3 +0 +2 +4 +0 +1 +3 +4 +5 +0 +2 +5 +6 +1 +2 +5 +6 +2 +3 +4 +6 +3 +4 +5 diff --git a/python/toy_graph/row.txt b/python/toy_graph/row.txt new file mode 100644 index 000000000..1a84c1d97 --- /dev/null +++ b/python/toy_graph/row.txt @@ -0,0 +1,8 @@ +0 +3 +6 +11 +15 +19 +23 +26 diff --git a/python/toy_graph/val.txt b/python/toy_graph/val.txt new file mode 100644 index 000000000..15282b913 --- /dev/null +++ b/python/toy_graph/val.txt @@ -0,0 +1,26 @@ +3 +4 +5 +3 +5 +7 +4 +5 +7 +8 +9 +5 +7 +10 +11 +7 +8 +11 +12 +9 +10 +11 +13 +11 +12 +13 diff --git a/shared_lib_tests/CMakeLists.txt b/shared_lib_tests/CMakeLists.txt index 3d3f638db..7d880fae6 100644 --- a/shared_lib_tests/CMakeLists.txt +++ b/shared_lib_tests/CMakeLists.txt @@ -1,20 +1,26 @@ # gunrock test rig cmake file # include_directories(${gunrock_INCLUDE_DIRS}/gunrock) -add_executable (test_topk test_topk.c) +add_executable(simple_interface_test simple_interface_test.c) +target_link_libraries(simple_interface_test gunrock) + +add_executable(test_topk test_topk.c) target_link_libraries(test_topk gunrock) -add_executable (test_bfs test_bfs.c) +add_executable(test_bfs test_bfs.c) 
target_link_libraries(test_bfs gunrock) -add_executable (test_bc test_bc.c) +add_executable(test_bc test_bc.c) target_link_libraries(test_bc gunrock) -add_executable (test_cc test_cc.c) +add_executable(test_cc test_cc.c) target_link_libraries(test_cc gunrock) -add_executable (test_sssp test_sssp.c) +add_executable(test_sssp test_sssp.c) target_link_libraries(test_sssp gunrock) -add_executable (test_pr test_pr.c) -target_link_libraries(test_pr gunrock) \ No newline at end of file +add_executable(test_pr test_pr.c) +target_link_libraries(test_pr gunrock) + +add_executable(test_mst test_mst.c) +target_link_libraries(test_mst gunrock) \ No newline at end of file diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c new file mode 100644 index 000000000..66cd14c70 --- /dev/null +++ b/shared_lib_tests/simple_interface_test.c @@ -0,0 +1,83 @@ +/** + * @brief Simple test for shared library simple interface + * @file simple_interface_test.c + */ + +#include +#include + +int main(int argc, char* argv[]) { + + /////////////////////////////////////////////////////////////////////////// + // define input graph + int row_offsets[] = { + 0, 3, 6, 11, 15, 19, 23, 26}; + int col_indices[] = { + 1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2, + 5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5}; + unsigned int edge_values[] = { + 3, 4, 5, 3, 5, 7, 4, 5, 7, 8, 9, 5, 7, 10, + 11, 7, 8, 11, 12, 9, 10, 11, 13, 11, 12, 13}; + + // nodes = length of row offsets-1, edges = length of column indices + size_t num_nodes = sizeof(row_offsets) / sizeof(row_offsets[0]) - 1; + size_t num_edges = sizeof(col_indices) / sizeof(col_indices[0]); + + /////////////////////////////////////////////////////////////////////////// + // allocate host arrays to store test results + int* bfs_label = ( int*)malloc(sizeof( int) * num_nodes); + float* bc_scores = (float*)malloc(sizeof(float) * num_nodes); + int* conn_comp = ( int*)malloc(sizeof( int) * num_nodes); + unsigned int *sssp_dist = + 
(unsigned int*)malloc(sizeof( unsigned int) * num_nodes); + int* pr_nodes = ( int*)malloc(sizeof( int) * num_nodes); + float* pr_ranks = (float*)malloc(sizeof(float) * num_nodes); + + /////////////////////////////////////////////////////////////////////////// + printf("\n testing breath-first search ...\n"); + bfs(bfs_label, num_nodes, num_edges, row_offsets, col_indices, 0); + int node; for (node = 0; node < num_nodes; ++node) { + printf(" node: [%d] | label (depth): [%d]\n", node, bfs_label[node]); + } + + /////////////////////////////////////////////////////////////////////////// + printf("\n testing betweenness centrality ...\n"); + bc(bc_scores, num_nodes, num_edges, row_offsets, col_indices, -1); + for (node = 0; node < num_nodes; ++node) { + printf(" node: [%d] | score: [%.4f]\n", node, bc_scores[node]); + } + + /////////////////////////////////////////////////////////////////////////// + printf("\n testing connected components ...\n"); + int num_comp = cc(conn_comp, num_nodes, num_edges, row_offsets, col_indices); + printf(" total number of components: %d\n", num_comp); + for (node = 0; node < num_nodes; ++node) { + printf(" node: [%d] | component: [%d]\n", node, conn_comp[node]); + } + + /////////////////////////////////////////////////////////////////////////// + printf("\n testing single-source shortest path ...\n"); + sssp(sssp_dist, num_nodes, num_edges, row_offsets, col_indices, edge_values, 0); + for (node = 0; node < num_nodes; ++node) { + printf(" node: [%d] | component: [%d]\n", node, sssp_dist[node]); + } + + /////////////////////////////////////////////////////////////////////////// + printf("\n testing pagerank ...\n"); + pagerank(pr_nodes, pr_ranks, num_nodes, num_edges, row_offsets, col_indices); + for (node = 0; node < num_nodes; ++node) { + printf(" node: [%d] | rank: [%.4f]\n", pr_nodes[node], pr_ranks[node]); + } + + // TODO(ydwu): add other primitive tests + + // clean ups + if (bfs_label) free(bfs_label); + if (bc_scores) 
free(bc_scores); + if (conn_comp) free(conn_comp); + if (sssp_dist) free(sssp_dist); + if (pr_nodes) free(pr_nodes); + if (pr_ranks) free(pr_ranks); + + return 0; +} diff --git a/shared_lib_tests/test_bc.c b/shared_lib_tests/test_bc.c index 0eb4fdf0f..177585a58 100644 --- a/shared_lib_tests/test_bc.c +++ b/shared_lib_tests/test_bc.c @@ -1,76 +1,65 @@ /** * @brief BC test for shared library * @file test_bc.c - * - * set input graph, configs and call function gunrock_bc_func - * return per node label values in graph_out node_values */ #include #include -int main(int argc, char* argv[]) -{ - // define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; - data_type.SIZET_TYPE = SIZET_INT; - data_type.VALUE_TYPE = VALUE_FLOAT; +int main(int argc, char* argv[]) { + // define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; + data_t.SIZET_TYPE = SIZET_INT; + data_t.VALUE_TYPE = VALUE_FLOAT; - // bc configurations (optional) - struct GunrockConfig bc_config; - bc_config.device = 0; - bc_config.src_node = -1; //!< source vertex to begin search - bc_config.queue_size = 1.0f; - bc_config.src_mode = manually; + // bc configurations (optional) + struct GRSetup config; + config.device = 0; + config.src_node = -1; // source vertex to begin search + config.queue_size = 1.0f; + config.src_mode = manually; - // define graph (undirected graph) - size_t num_nodes = 7; - size_t num_edges = 26; - int row_offsets[8] = {0, 3, 6, 11, 15, 19, 23, 26}; - int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2, - 5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5}; + // define graph (undirected graph) + size_t num_nodes = 7; + size_t num_edges = 26; + int row_offsets[8] = {0, 3, 6, 11, 15, 19, 23, 26}; + int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2, + 5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5}; - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - 
graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; - // malloc output graph - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); - // run bc calculations - gunrock_bc_func( - graph_output, - graph_input, - bc_config, - data_type); + // run bc calculations + gunrock_bc(graph_o, graph_i, config, data_t); - // test print - int i; - printf("Demo Outputs:\n"); - // print per node betweeness centrality values - float *bc_vals = (float*)malloc(sizeof(float) * graph_input->num_nodes); - bc_vals = (float*)graph_output->node_values; - for (i = 0; i < graph_input->num_nodes; ++i) - { - printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]); - } - printf("\n"); - // print per edge betweeness centrality values - float *ebc_vals = (float*)malloc(sizeof(float)*graph_input->num_edges); - ebc_vals = (float*)graph_output->edge_values; - for (i = 0; i < graph_input->num_edges; ++i) - { - printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]); - } + // test print + int i; + printf("Demo Outputs:\n"); + // print per node betweeness centrality values + float *bc_vals = (float*)malloc(sizeof(float) * graph_i->num_nodes); + bc_vals = (float*)graph_o->node_values; + for (i = 0; i < graph_i->num_nodes; ++i) { + printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]); + } + printf("\n"); + // print per edge betweeness centrality values + float *ebc_vals = (float*)malloc(sizeof(float) * graph_i->num_edges); + ebc_vals = (float*)graph_o->edge_values; + for (i = 0; i < 
graph_i->num_edges; ++i) { + printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]); + } - if (graph_input) { free(graph_input); } - if (graph_output) { free(graph_output); } + // clean up + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } - return 0; + return 0; } diff --git a/shared_lib_tests/test_bfs.c b/shared_lib_tests/test_bfs.c index d3f57b747..11b43b2a5 100644 --- a/shared_lib_tests/test_bfs.c +++ b/shared_lib_tests/test_bfs.c @@ -1,69 +1,59 @@ /** * @brief BFS test for shared library * @file test_bfs.c - * - * set input graph, configs and call function gunrock_bfs_func - * return per node label values in graph_out node_values */ #include #include -int main(int argc, char* argv[]) -{ - // define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; - data_type.SIZET_TYPE = SIZET_INT; - data_type.VALUE_TYPE = VALUE_INT; - - // bfs configurations (optional) - struct GunrockConfig bfs_config; - bfs_config.device = 0; - bfs_config.src_mode = randomize; - bfs_config.src_node = 1; //!< source vertex to begin search - bfs_config.mark_pred = false; //!< do not mark predecessors - bfs_config.idempotence = false; //!< wether enable idempotence - bfs_config.queue_size = 1.0f; - - // define graph - size_t num_nodes = 7; - size_t num_edges = 15; - int row_offsets[8] = {0,3,6,9,11,14,15,15}; - int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6}; - - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; - - // malloc output graph - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - - // run bfs calculations - gunrock_bfs_func( - graph_output, - graph_input, - bfs_config, - data_type); - - // test print - int i; - printf("Demo 
Outputs:\n"); - int *labels = (int*)malloc(sizeof(int) * graph_input->num_nodes); - labels = (int*)graph_output->node_values; - for (i = 0; i < graph_input->num_nodes; ++i) - { - printf("Node_ID [%d] : Label [%d]\n", i, labels[i]); - } - - if (graph_input) { free(graph_input); } - if (graph_output) { free(graph_output); } - if (labels) { free(labels); } - - return 0; +int main(int argc, char* argv[]) { + // define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; + data_t.SIZET_TYPE = SIZET_INT; + data_t.VALUE_TYPE = VALUE_INT; + + // bfs configurations (optional) + struct GRSetup config; + config.device = 0; + config.src_mode = randomize; + config.src_node = 1; // source vertex to begin search + config.mark_pred = false; // do not mark predecessors + config.idempotence = false; // wether enable idempotence + config.queue_size = 1.0f; + + // define graph + size_t num_nodes = 7; + size_t num_edges = 15; + int row_offsets[8] = {0, 3, 6, 9, 11, 14, 15, 15}; + int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6}; + + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + // run bfs calculations + gunrock_bfs(graph_o, graph_i, config, data_t); + + // test print + int i; + printf("Demo Outputs:\n"); + int *labels = (int*)malloc(sizeof(int) * graph_i->num_nodes); + labels = (int*)graph_o->node_values; + for (i = 0; i < graph_i->num_nodes; ++i) { + printf("Node_ID [%d] : Label [%d]\n", i, labels[i]); + } + + // clean up + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } + if (labels) { free(labels); } + + return 0; } diff --git a/shared_lib_tests/test_cc.c b/shared_lib_tests/test_cc.c index 
a230619b9..0dbd67bc1 100644 --- a/shared_lib_tests/test_cc.c +++ b/shared_lib_tests/test_cc.c @@ -1,66 +1,55 @@ /** * @brief CC test for shared library * @file test_cc.c - * - * set input graph, configs and call function gunrock_cc_func - * return per node label values in graph_out node_values */ #include #include -int main(int argc, char* argv[]) -{ - // define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; - data_type.SIZET_TYPE = SIZET_INT; - data_type.VALUE_TYPE = VALUE_INT; - - // connected component configurations - struct GunrockConfig configs; - configs.device = 0; - - // define graph - size_t num_nodes = 7; - size_t num_edges = 15; - int row_offsets[8] = {0,3,6,9,11,14,15,15}; - int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6}; - - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; - - // malloc output graph - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int)); - - // run connected component calculations - gunrock_cc_func( - graph_output, - components, - graph_input, - configs, - data_type); - - // test print - int i; - printf("Number of Components: %d\n", components[0]); - printf("Demo Outputs:\n"); - int *component_ids = (int*)malloc(sizeof(int) * graph_input->num_nodes); - component_ids = (int*)graph_output->node_values; - for (i = 0; i < graph_input->num_nodes; ++i) - { - printf("Node_ID [%d] : Component_ID [%d]\n", i, component_ids[i]); - } - - if (graph_input) { free(graph_input); } - if (graph_output) { free(graph_output); } - - return 0; +int main(int argc, char* argv[]) { + // define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = 
VTXID_INT; + data_t.SIZET_TYPE = SIZET_INT; + data_t.VALUE_TYPE = VALUE_INT; + + // connected component configurations + struct GRSetup config; + config.device = 0; + + // define graph + size_t num_nodes = 7; + size_t num_edges = 15; + int row_offsets[8] = {0, 3, 6, 9, 11, 14, 15, 15}; + int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6}; + + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int)); + + // run connected component calculations + gunrock_cc(graph_o, components, graph_i, config, data_t); + + // demo test print + printf("Number of Components: %d\n", components[0]); + printf("Demo Outputs:\n"); + int *component_ids = (int*)malloc(sizeof(int) * graph_i->num_nodes); + component_ids = (int*)graph_o->node_values; + int node; + for (node = 0; node < graph_i->num_nodes; ++node) { + printf("Node_ID [%d] : Component_ID [%d]\n", node, component_ids[node]); + } + + // clean up + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } + + return 0; } diff --git a/shared_lib_tests/test_mst.c b/shared_lib_tests/test_mst.c new file mode 100644 index 000000000..07fbdb11c --- /dev/null +++ b/shared_lib_tests/test_mst.c @@ -0,0 +1,57 @@ +/** + * @brief MST test for shared library + * @file test_mst.c + */ + +#include +#include + +int main(int argc, char* argv[]) { + // set problem data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; + data_t.VALUE_TYPE = VALUE_INT; + data_t.SIZET_TYPE = SIZET_INT; + + // configurations (optional) + struct GRSetup config; + config.device = 0; + + // tiny sample graph + size_t num_nodes = 7; + size_t num_edges = 26; 
+ int row_offsets[8] = {0, 3, 6, 11, 15, 19, 23, 26}; + int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2, + 5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5}; + int edge_values[26] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + + // build an graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + graph_i->edge_values = (void*)&edge_values[0]; + + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + + // call minimum spanning tree + gunrock_mst(graph_o, graph_i, config, data_t); + + // demo test print + printf("Demo Outputs:\n"); + int *mst_mask = (int*)malloc(sizeof(int) * num_edges); + mst_mask = (int*)graph_o->edge_values; + int edge; + for (edge = 0; edge < num_edges; ++edge) { + printf("Edge ID [%d] : Mask [%d]\n", edge, mst_mask[edge]); + } + + // clean up + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } + + return 0; +} diff --git a/shared_lib_tests/test_pr.c b/shared_lib_tests/test_pr.c index 0b8ceae0c..cb36b4df1 100644 --- a/shared_lib_tests/test_pr.c +++ b/shared_lib_tests/test_pr.c @@ -1,74 +1,62 @@ /** * @brief PR test for shared library * @file test_pr.c - * - * set input graph, configs and call function gunrock_pr_func - * return per node or per edge values in graph_out node_values */ #include #include -int main(int argc, char* argv[]) -{ - // define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; //!< integer type vertex_ids - data_type.SIZET_TYPE = SIZET_INT; //!< integer type graph size - data_type.VALUE_TYPE = VALUE_FLOAT; //!< float type value for pr - - // pr configurations (optional) - struct GunrockConfig pr_config; - pr_config.device = 0; //!< use device 0 - pr_config.delta = 0.85f; //!< default delta 
value - pr_config.error = 0.01f; //!< default error threshold - pr_config.max_iter = 20; //!< maximum number of iterations - pr_config.top_nodes = 10; //!< number of top nodes - pr_config.src_node = 0; //!< source node to begin page rank - pr_config.src_mode = manually; //!< set source node manually - - // define graph (undirected graph) - size_t num_nodes = 7; - size_t num_edges = 15; - int row_offsets[8] = {0,3,6,9,11,14,15,15}; - int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6}; - - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; - - // malloc output graph - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - int *node_ids = (int*)malloc(sizeof(int) * pr_config.top_nodes); - float *page_rank = (float*)malloc(sizeof(float) * pr_config.top_nodes); - - // run pr calculations - gunrock_pr_func( - graph_output, - node_ids, - page_rank, - graph_input, - pr_config, - data_type); - - // test print - int i; - printf("Demo Outputs:\n"); - if (pr_config.top_nodes > num_nodes) pr_config.top_nodes = num_nodes; - for (i = 0; i < pr_config.top_nodes; ++i) - { - printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], page_rank[i]); - } - - if (node_ids) { free(node_ids); } - if (page_rank) { free(page_rank); } - if (graph_input) { free(graph_input); } - if (graph_output) { free(graph_output); } - - return 0; +int main(int argc, char* argv[]) { + // define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; // integer type vertex_ids + data_t.SIZET_TYPE = SIZET_INT; // integer type graph size + data_t.VALUE_TYPE = VALUE_FLOAT; // float type value for pr + + // pr configurations (optional) + struct GRSetup config; + config.device = 0; // use device 0 + 
config.delta = 0.85f; // default delta value + config.error = 0.01f; // default error threshold + config.max_iter = 20; // maximum number of iterations + config.top_nodes = 10; // number of top nodes + config.src_node = 0; // source node to begin page rank + config.src_mode = manually; // set source node manually + + // define graph (undirected graph) + size_t num_nodes = 7; + size_t num_edges = 15; + int row_offsets[8] = {0, 3, 6, 9, 11, 14, 15, 15}; + int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6}; + + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + int *node_ids = (int*)malloc(sizeof(int) * config.top_nodes); + float *pagerank = (float*)malloc(sizeof(float) * config.top_nodes); + + // run pr calculations + gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, data_t); + + // test print + int i; + printf("Demo Outputs:\n"); + if (config.top_nodes > num_nodes) config.top_nodes = num_nodes; + for (i = 0; i < config.top_nodes; ++i) { + printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], pagerank[i]); + } + + // clean up + if (node_ids) { free(node_ids); } + if (pagerank) { free(pagerank); } + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } + + return 0; } diff --git a/shared_lib_tests/test_sssp.c b/shared_lib_tests/test_sssp.c index f4fc0fe5b..e22370a3d 100644 --- a/shared_lib_tests/test_sssp.c +++ b/shared_lib_tests/test_sssp.c @@ -1,75 +1,63 @@ /** * @brief SSSP test for shared library * @file test_sssp.c - * - * set input graph, configs and call function gunrock_sssp_func - * return per node or per edge values in graph_out node_values */ #include #include -int main(int argc, char* argv[]) -{ - 
// define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; - data_type.SIZET_TYPE = SIZET_INT; - data_type.VALUE_TYPE = VALUE_UINT; - - // pr configurations (optional) - struct GunrockConfig sssp_config; - sssp_config.device = 0; - sssp_config.mark_pred = true; - sssp_config.queue_size = 1.0f; - sssp_config.delta_factor = 1; - sssp_config.src_mode = randomize; - //sssp_config.src_node = 1; - - // define graph - size_t num_nodes = 7; - size_t num_edges = 15; - - int row_offsets[8] = {0,3,6,9,11,14,15,15}; - int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6}; - unsigned int edge_values[15] = {39,6,41,51,63,17,10,44,41,13,58,43,50,59,35}; - - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; - graph_input->edge_values = (void*)&edge_values[0]; - - // malloc output graph - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - int *predecessor = (int*)malloc(sizeof(int) * num_nodes); - - // run sssp calculations - gunrock_sssp_func( - graph_output, - predecessor, - graph_input, - sssp_config, - data_type); - - // test print - int i; - printf("Demo Outputs:\n"); - int *label = (int*)malloc(sizeof(int) * num_nodes); - label = (int*)graph_output->node_values; - for (i = 0; i < num_nodes; ++i) - { - printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n", - i, label[i], predecessor[i]); - } - - if (predecessor) { free(predecessor); } - if (graph_input) { free(graph_input); } - if (graph_output) { free(graph_output); } - - return 0; +int main(int argc, char* argv[]) { + // define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; + data_t.SIZET_TYPE = SIZET_INT; + data_t.VALUE_TYPE = VALUE_UINT; + + // configurations (optional) 
+ struct GRSetup config; + config.device = 0; + config.mark_pred = true; + config.queue_size = 1.0f; + config.delta_factor = 1; + config.src_mode = randomize; + + // define graph + size_t num_nodes = 7; + size_t num_edges = 15; + + int row_offsets[8] = {0, 3, 6, 9, 11, 14, 15, 15}; + int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6}; + unsigned int edge_values[15] = {39, 6, 41, 51, 63, 17, 10, 44, 41, 13, 58, 43, 50, 59, 35}; + + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + graph_i->edge_values = (void*)&edge_values[0]; + + // malloc output graph + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + int *predecessor = (int*)malloc(sizeof(int) * num_nodes); + + // run calculations + gunrock_sssp(graph_o, predecessor, graph_i, config, data_t); + + // demo test print + printf("Demo Outputs:\n"); + int *label = (int*)malloc(sizeof(int) * num_nodes); + label = (int*)graph_o->node_values; + int node; + for (node = 0; node < num_nodes; ++node) { + printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n", + node, label[node], predecessor[node]); + } + + // clean up + if (predecessor) { free(predecessor); } + if (graph_i) { free(graph_i); } + if (graph_o) { free(graph_o); } + + return 0; } diff --git a/shared_lib_tests/test_topk.c b/shared_lib_tests/test_topk.c index 1feea5e97..416fe9f7c 100644 --- a/shared_lib_tests/test_topk.c +++ b/shared_lib_tests/test_topk.c @@ -1,68 +1,65 @@ +/** + * @brief Top K test for shared library + * @file test_topk.c + */ + #include #include -int main(int argc, char* argv[]) -{ - // define data types - struct GunrockDataType data_type; - data_type.VTXID_TYPE = VTXID_INT; - data_type.SIZET_TYPE = SIZET_INT; - data_type.VALUE_TYPE = VALUE_INT; +int main(int argc, char* argv[]) { + // 
define data types + struct GRTypes data_t; + data_t.VTXID_TYPE = VTXID_INT; + data_t.SIZET_TYPE = SIZET_INT; + data_t.VALUE_TYPE = VALUE_INT; + + struct GRSetup config; + config.device = 0; + config.top_nodes = 3; - struct GunrockConfig topk_config; - topk_config.device = 0; - topk_config.top_nodes = 3; + // define graph (directed, reversed and non-reversed) + size_t num_nodes = 7; + size_t num_edges = 15; - // define graph (directed, reversed and non-reversed) - size_t num_nodes = 7; - size_t num_edges = 15; + int row_offsets[8] = {0, 3, 6, 9, 11, 14, 15, 15}; + int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6}; - int row_offsets[8] = {0,3,6,9,11,14,15,15}; - int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6}; + int col_offsets[8] = {0, 1, 2, 5, 7, 9, 12, 15}; + int row_indices[15] = {1, 0, 0, 1, 4, 0, 2, 1, 2, 2, 3, 4, 3, 4, 5}; - int col_offsets[8] = {0,1,2,5,7,9,12,15}; - int row_indices[15] = {1,0,0,1,4,0,2,1,2,2,3,4,3,4,5}; + // build graph as input + struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + graph_i->num_nodes = num_nodes; + graph_i->num_edges = num_edges; + graph_i->row_offsets = (void*)&row_offsets[0]; + graph_i->col_indices = (void*)&col_indices[0]; + graph_i->col_offsets = (void*)&col_offsets[0]; + graph_i->row_indices = (void*)&row_indices[0]; - // build graph as input - struct GunrockGraph *graph_input = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - graph_input->num_nodes = num_nodes; - graph_input->num_edges = num_edges; - graph_input->row_offsets = (void*)&row_offsets[0]; - graph_input->col_indices = (void*)&col_indices[0]; - graph_input->col_offsets = (void*)&col_offsets[0]; - graph_input->row_indices = (void*)&row_indices[0]; + // malloc output result arrays + struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph)); + int *node_ids = (int*)malloc(sizeof(int) * config.top_nodes); + int *in_degrees = (int*)malloc(sizeof(int) * config.top_nodes); + int 
*out_degrees = (int*)malloc(sizeof(int) * config.top_nodes); - // malloc output result arrays - struct GunrockGraph *graph_output = - (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph)); - int *node_ids = (int*)malloc(sizeof(int) * topk_config.top_nodes); - int *in_degrees = (int*)malloc(sizeof(int) * topk_config.top_nodes); - int *out_degrees = (int*)malloc(sizeof(int) * topk_config.top_nodes); + // run topk calculations + gunrock_topk( + graph_o, node_ids, in_degrees, out_degrees, graph_i, config, data_t); - // run topk calculations - gunrock_topk_func( - graph_output, - node_ids, - in_degrees, - out_degrees, - graph_input, - topk_config, - data_type); + // print results for check correctness + printf("Demo Outputs:\n"); + int node; + for (node = 0; node < config.top_nodes; ++node) { + printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n", + node_ids[node], in_degrees[node], out_degrees[node]); + } - // print results for check correctness - int i; - printf("Demo Outputs:\n"); - for (i = 0; i < topk_config.top_nodes; ++i) - { - printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n", - node_ids[i], in_degrees[i], out_degrees[i]); - } + // clean up + if (in_degrees) free(in_degrees); + if (out_degrees) free(out_degrees); + if (node_ids) free(node_ids); + if (graph_i) free(graph_i); + if (graph_o) free(graph_o); - if (in_degrees) free(in_degrees); - if (out_degrees) free(out_degrees); - if (node_ids) free(node_ids); - if (graph_input) free(graph_input); - if (graph_output) free(graph_output); - return 0; + return 0; } \ No newline at end of file diff --git a/simple_example/Makefile b/simple_example/Makefile index 9762bce03..5fade4d66 100644 --- a/simple_example/Makefile +++ b/simple_example/Makefile @@ -94,7 +94,7 @@ else ARCH = -m64 endif -NVCCFLAGS = -Xptxas -v -Xcudafe -\# +NVCCFLAGS = -Xptxas -v -Xcudafe -\# -lineinfo ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500 @@ -123,15 +123,15 @@ endif 
# Dependency Lists #------------------------------------------------------------------------------- -DEPS = ./Makefile \ - $(wildcard ../gunrock/util/*.cuh) \ - $(wildcard ../gunrock/util/**/*.cuh) \ - $(wildcard ../gunrock/*.cuh) \ - $(wildcard ../gunrock/graphio/*.cuh) \ - $(wildcard ../gunrock/oprtr/*.cuh) \ - $(wildcard ../gunrock/oprtr/**/*.cuh) \ - $(wildcard ../gunrock/app/*.cuh) \ - $(wildcard ../gunrock/app/**/*.cuh) +DEPS = ./Makefile \ + $(wildcard ../gunrock/util/*.cuh) \ + $(wildcard ../gunrock/util/**/*.cuh) \ + $(wildcard ../gunrock/*.cuh) \ + $(wildcard ../gunrock/graphio/*.cuh) \ + $(wildcard ../gunrock/oprtr/*.cuh) \ + $(wildcard ../gunrock/oprtr/**/*.cuh) \ + $(wildcard ../gunrock/app/*.cuh) \ + $(wildcard ../gunrock/app/**/*.cuh) #------------------------------------------------------------------------------- # (make simple) Simple example driver for three primitives: CC, BFS and BC @@ -139,9 +139,9 @@ DEPS = ./Makefile \ simple: bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) -bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS) +bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS) mkdir -p bin - $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3 + $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3 #------------------------------------------------------------------------------- # Clean diff 
--git a/tests/bc/test_bc.cu b/tests/bc/test_bc.cu index 91ceb9975..103aa4325 100644 --- a/tests/bc/test_bc.cu +++ b/tests/bc/test_bc.cu @@ -253,7 +253,7 @@ void RefCPUBC( for (idx = 0; idx < graph.edges; ++idx) { //std::cout << coo[idx].row << "," << coo[idx].col << ":" << coo[idx].val << std::endl; - ebc_values[idx] = coo[idx].val; + //ebc_values[idx] = coo[idx].val; } printf("CPU BC finished in %lf msec.", elapsed); @@ -315,9 +315,12 @@ void RefCPUBC( for (int iter = search_depth - 2; iter > 0; --iter) { + + int cur_level = 0; for (int node = 0; node < graph.nodes; ++node) { if (source_path[node] == iter) { + ++cur_level; int edges_begin = graph.row_offsets[node]; int edges_end = graph.row_offsets[node+1]; @@ -334,9 +337,7 @@ void RefCPUBC( } for (int i = 0; i < graph.nodes; ++i) - { bc_values[i] *= 0.5f; - } cpu_timer.Stop(); float elapsed = cpu_timer.ElapsedMillis(); @@ -847,6 +848,7 @@ int main( int argc, char** argv) return 1; } + csr.PrintHistogram(); } else if (graph_type == "rmat") { diff --git a/tests/hits/CMakeLists.txt b/tests/hits/CMakeLists.txt index 2ba54cb95..ef9e22ff5 100644 --- a/tests/hits/CMakeLists.txt +++ b/tests/hits/CMakeLists.txt @@ -12,7 +12,7 @@ set (mgpu_SOURCE_FILES ${mgpu_SOURCE_DIRS}/mgpucontext.cu ${mgpu_SOURCE_DIRS}/mgpuutil.cpp) -CUDA_ADD_EXECUTABLE(hyperlink_induced_topic_search +CUDA_ADD_EXECUTABLE(HITS test_hits.cu ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu ${CMAKE_SOURCE_DIR}/gunrock/util/error_utils.cu diff --git a/tests/mst/run.sh b/tests/mst/run.sh new file mode 100644 index 000000000..694a12fc0 --- /dev/null +++ b/tests/mst/run.sh @@ -0,0 +1,36 @@ +#!/bin/sh + +OPTION="--quick" + +# --quick running without CPU reference algorithm, if you want to test CPU +# reference algorithm, delete $OPTION2 in some lines. Warning: for large +# data this can take a long time. 
+ +# get all execution files in ./bin +files=(./bin/*) + +# newest version seen so far ("$" sorts before every digit in ASCII) +max_ver_num="$" +# fall back to the first file when no name carries a version number +exe_file=${files[0]} + +# iterate over all file names to get the largest version number +for x in "${files[@]}" +do + output=$(grep -o "[0-9]\.[0-9]" <<<"$x") + if [ "$output" \> "$max_ver_num" ]; then + exe_file=$x; max_ver_num=$output + fi +done + +# put OS and Device here +SUFFIX="ubuntu12.04.k40c" + +mkdir -p eval/$SUFFIX + +for i in belgium_osm coAuthorsDBLP delaunay_n13 delaunay_n21 +do + echo $exe_file market ../../dataset/large/$i/$i.mtx $OPTION + $exe_file market ../../dataset/large/$i/$i.mtx $OPTION > eval/$SUFFIX/$i.$SUFFIX.txt + sleep 1 +done diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu index 6083d9bd5..7c9eb5768 100644 --- a/tests/mst/test_mst.cu +++ b/tests/mst/test_mst.cu @@ -28,6 +28,7 @@ #include // MST includes +#include #include #include #include @@ -83,6 +84,11 @@ void Usage() /** * @brief Displays the MST result * + * @tparam VertexId + * @tparam Value + * @tparam SizeT + * + * @param[in] graph reference to the CSR graph we process on */ //////////////////////////////////////////////////////////////////////////////// template @@ -122,6 +128,25 @@ void DisplaySolution(const Csr &graph, int *mst_output) if (source) { delete [] source; } } +/** + * @brief Connectivity check: true when run_cc reports exactly one component + * + * @tparam VertexId + * @tparam Value + * @tparam SizeT + * + * @param[in] graph reference to the CSR graph we process on + */ +template +bool IsConnected(const Csr & graph) +{ + GRGraph *temp = (GRGraph*)malloc(sizeof(GRGraph)); + unsigned int components = 0; // filled in by run_cc; a local cannot leak + run_cc(temp, &components, graph, 0, 1); + if (temp) free(temp); + return components == 1; +} + /** * @brief A simple CPU-based reference MST implementation. 
* @@ -136,18 +161,18 @@ void DisplaySolution(const Csr &graph, int *mst_output) */ //////////////////////////////////////////////////////////////////////////////// template -long long int SimpleReferenceMST( +Value SimpleReferenceMST( const Value *edge_values, const Csr &graph) { - printf("\nREFERENCE TEST\n"); + printf("\nMST CPU REFERENCE TEST\n"); // Kruskal minimum spanning tree preparations using namespace boost; - typedef adjacency_list < vecS, vecS, undirectedS, - no_property, property < edge_weight_t, int > > Graph; + typedef adjacency_list< vecS, vecS, undirectedS, + no_property, property > Graph; typedef graph_traits < Graph >::edge_descriptor Edge; typedef graph_traits < Graph >::vertex_descriptor Vertex; - typedef std::pair E; + typedef std::pair E; E *edge_pairs = new E[graph.edges]; int idx = 0; @@ -165,16 +190,18 @@ long long int SimpleReferenceMST( CpuTimer cpu_timer; // record the kernel running time cpu_timer.Start(); + // compute reference using kruskal_min_spanning_tree algorithm kruskal_minimum_spanning_tree(g, std::back_inserter(spanning_tree)); + cpu_timer.Stop(); float elapsed_cpu = cpu_timer.ElapsedMillis(); // analyze reference results - SizeT num_selected_cpu = 0; - long long int total_weight_cpu = 0; + SizeT num_selected_cpu = 0; + Value total_weight_cpu = 0; - if (graph.nodes <= 50) printf("CPU Minimum Spanning Tree\n"); + if (graph.nodes <= 50) { printf("CPU Minimum Spanning Tree\n"); } for (std::vector < Edge >::iterator ei = spanning_tree.begin(); ei != spanning_tree.end(); ++ei) { @@ -182,7 +209,7 @@ long long int SimpleReferenceMST( { // print the edge pairs in the minimum spanning tree printf("%ld %ld\n", source(*ei, g), target(*ei, g)); - // printf(" with weight of %d\n", weight[*ei]); + // printf(" with weight of %f\n", weight[*ei]); } ++num_selected_cpu; total_weight_cpu += weight[*ei]; @@ -214,10 +241,10 @@ long long int SimpleReferenceMST( //////////////////////////////////////////////////////////////////////////////// 
template void RunTests( - const Csr &graph, + const Csr & graph, int max_grid_size, int num_gpus, - mgpu::CudaContext& context) + mgpu::CudaContext & context) { printf("\nMINIMUM SPANNING TREE TEST\n"); @@ -243,7 +270,7 @@ void RunTests( "MST Problem Data Reset Failed", __FILE__, __LINE__); // perform MST - GpuTimer gpu_timer; // record the kernel running time + GpuTimer gpu_timer; // record the kernel running time gpu_timer.Start(); @@ -261,7 +288,7 @@ void RunTests( util::GRError(mst_problem->Extract(h_mst_output), "MST Problem Data Extraction Failed", __FILE__, __LINE__); - if (!g_quick) // run CPU reference test + if (!g_quick) // run CPU reference test { // calculate GPU final number of selected edges int num_selected_gpu = 0; @@ -272,27 +299,27 @@ void RunTests( // printf("\nGPU - Number of Edges in MST: %d\n", num_selected_gpu); // calculate GPU total selected MST weights for validation - long long int total_weight_gpu = 0; + Value total_weight_gpu = 0; for (int iter = 0; iter < graph.edges; ++iter) { total_weight_gpu += h_mst_output[iter] * graph.edge_values[iter]; } // correctness validation - long long int total_weight_cpu = - SimpleReferenceMST(graph.edge_values, graph); + Value total_weight_cpu = SimpleReferenceMST(graph.edge_values, graph); if (total_weight_cpu == total_weight_gpu) { // print the edge pairs in the minimum spanning tree DisplaySolution(graph, h_mst_output); printf("\nCORRECT.\n"); + std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl; + std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl; } else { - printf("INCORRECT. 
\n" - "CPU Computed Total Weight = %lld\n" - "GPU Computed Total Weight = %lld\n", - total_weight_cpu, total_weight_gpu); + printf("INCORRECT.\n"); + std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl; + std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl; } } @@ -316,17 +343,16 @@ void RunTests( */ template void RunTests( - const Csr &graph, - CommandLineArgs &args, - mgpu::CudaContext& context) + const Csr & graph, + CommandLineArgs & args, + mgpu::CudaContext & context) { - bool instrumented = false; // do not collect instrumentation from kernels - int max_grid_size = 0; // maximum grid size (up to the enactor) - int num_gpus = 1; // number of GPUs for multi-gpu enactor to use - g_quick = false; // Whether or not to skip ref validation + bool instrumented = 0; // do not collect instrumentation from kernels + int max_grid_size = 0; // maximum grid size (up to the enactor) + int num_gpus = 1; // number of GPUs for multi-gpu enactor to use + g_quick = 0; // Whether or not to skip ref validation instrumented = args.CheckCmdLineFlag("instrumented"); - g_quick = args.CheckCmdLineFlag("quick"); g_verbose = args.CheckCmdLineFlag("v"); @@ -376,12 +402,12 @@ int main(int argc, char** argv) if (graph_type == "market") { - // matrix-market coordinate-formatted graph file - typedef int VertexId; // use as the vertex identifier type - typedef int Value; // use as the value type - typedef int SizeT; // use as the graph size type + // currently support Value type: int, float, double + typedef int VertexId; // use as the vertex identifier + typedef int Value; // use as the value type + typedef int SizeT; // use as the graph size // default value for stream_from_host is false if (graph_args < 1) @@ -396,28 +422,28 @@ int main(int argc, char** argv) // template argument = true because the graph has edge values Csr csr(false); if (graphio::BuildMarketGraph( - market_filename, - csr, - g_undirected, - false) != 0) { return 1; } + market_filename, 
csr, g_undirected, false) != 0) { return 1; } - // display graph - // csr.DisplayGraph(); + // display input graph + // csr.DisplayGraph(true); - /*************************************************************** - * To make sure two graphs have same weight value for each edge * - * we have to change ll_value = rand()%64 in market.cuh file to * - * some NON-RANDOM value if the original graph does NOT contain * - * weight per edge. Note it only support FULLY-CONNECTED graphs * - ***************************************************************/ - - // run GPU tests - RunTests(csr, args, *context); + /************************************************************************** + * Note: Minimum Spanning Tree only supports undirected, connected graphs * + **************************************************************************/ + // test graph connectivity; report failure through the exit code + if (!IsConnected(csr)) + { + fprintf(stderr, "Unsupported non-fully connected graph input.\n"); + return 1; + } + + // run GPU tests + RunTests(csr, args, *context); } else { - fprintf(stderr, "Unspecified graph type\n"); + fprintf(stderr, "Unspecified graph type.\n"); return 1; } @@ -428,4 +454,4 @@ int main(int argc, char** argv) // Local Variables: // mode:c++ // c-file-style: "NVIDIA" -// End: +// End: diff --git a/tests/sssp/ppopp-test.sh b/tests/sssp/ppopp-test.sh index cbc55562d..8934de90b 100644 --- a/tests/sssp/ppopp-test.sh +++ b/tests/sssp/ppopp-test.sh @@ -1,7 +1,7 @@ mkdir -p eval/PPOPP15 for i in 1-soc 2-bitcoin 3-kron 6-roadnet do - echo ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 - ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt + echo ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 + ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt sleep 1 done diff --git 
a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu index b9075bbc8..98663fbac 100644 --- a/tests/sssp/test_sssp.cu +++ b/tests/sssp/test_sssp.cu @@ -265,10 +265,10 @@ template< typename SizeT, bool MARK_PREDECESSORS> void SimpleReferenceSssp( - const Csr &graph, - Value *node_values, - VertexId *node_preds, - VertexId src) + const Csr &graph, + Value *node_values, + VertexId *node_preds, + VertexId src) { using namespace boost; @@ -279,11 +279,10 @@ void SimpleReferenceSssp( typedef graph_traits::vertex_descriptor vertex_descriptor; typedef graph_traits::edge_descriptor edge_descriptor; - typedef std::pair Edge; + typedef std::pair Edge; - Edge* edges = (Edge*)malloc(sizeof(Edge)*graph.edges); - unsigned int *weight = - (unsigned int*)malloc(sizeof(unsigned int)*graph.edges); + Edge *edges = ( Edge*)malloc(sizeof( Edge)*graph.edges); + Value *weight = (Value*)malloc(sizeof(Value)*graph.edges); for (int i = 0; i < graph.nodes; ++i) { @@ -296,7 +295,7 @@ void SimpleReferenceSssp( Graph g(edges, edges + graph.edges, weight, graph.nodes); - std::vector d(graph.nodes); + std::vector d(graph.nodes); std::vector p(graph.nodes); vertex_descriptor s = vertex(src, g); @@ -309,28 +308,30 @@ void SimpleReferenceSssp( CpuTimer cpu_timer; cpu_timer.Start(); - if (MARK_PREDECESSORS) - dijkstra_shortest_paths( - g, s, - predecessor_map(boost::make_iterator_property_map(p.begin(), get(boost::vertex_index, g))). 
- distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g)))); - else - dijkstra_shortest_paths( - g, s, - distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g)))); + if (MARK_PREDECESSORS) { + dijkstra_shortest_paths(g, s, + predecessor_map(boost::make_iterator_property_map( + p.begin(), get(boost::vertex_index, g))).distance_map( + boost::make_iterator_property_map( + d.begin(), get(boost::vertex_index, g)))); + } else { + dijkstra_shortest_paths(g, s, + distance_map(boost::make_iterator_property_map( + d.begin(), get(boost::vertex_index, g)))); + } cpu_timer.Stop(); float elapsed = cpu_timer.ElapsedMillis(); printf("CPU SSSP finished in %lf msec.\n", elapsed); - Coo* sort_dist = NULL; - Coo* sort_pred = NULL; - sort_dist = (Coo*)malloc( - sizeof(Coo) * graph.nodes); - if (MARK_PREDECESSORS) - sort_pred = (Coo*)malloc( - sizeof(Coo) * graph.nodes); - + Coo* sort_dist = NULL; + Coo* sort_pred = NULL; + sort_dist = (Coo*)malloc( + sizeof(Coo) * graph.nodes); + if (MARK_PREDECESSORS) { + sort_pred = (Coo*)malloc( + sizeof(Coo) * graph.nodes); + } graph_traits < Graph >::vertex_iterator vi, vend; for (tie(vi, vend) = vertices(g); vi != vend; ++vi) { @@ -339,7 +340,7 @@ void SimpleReferenceSssp( } std::stable_sort( sort_dist, sort_dist + graph.nodes, - RowFirstTupleCompare >); + RowFirstTupleCompare >); if (MARK_PREDECESSORS) { @@ -350,21 +351,21 @@ void SimpleReferenceSssp( } std::stable_sort( sort_pred, sort_pred + graph.nodes, - RowFirstTupleCompare >); + RowFirstTupleCompare< Coo >); } for (int i = 0; i < graph.nodes; ++i) { node_values[i] = sort_dist[i].col; } - if (MARK_PREDECESSORS) + if (MARK_PREDECESSORS) { for (int i = 0; i < graph.nodes; ++i) { node_preds[i] = sort_pred[i].col; } - - free(sort_dist); - if (MARK_PREDECESSORS) free(sort_pred); + } + if (sort_dist) free(sort_dist); + if (sort_pred) free(sort_pred); } @@ -687,6 +688,7 @@ void RunTests( parameter -> gpu_idx = gpu_idx; parameter -> 
streams = streams; + // source vertex to start args.GetCmdLineArgument("src", src_str); if (src_str.empty()) { parameter->src = 0; @@ -704,7 +706,7 @@ void RunTests( args.GetCmdLineArgument("traversal-mode", parameter->traversal_mode); if (parameter->traversal_mode == -1) { - parameter->traversal_mode = graph->GetAverageDegree() > 8 ? 0 : 1; + parameter->traversal_mode = 0; } printf("src = %lld\n", parameter->src); @@ -780,8 +782,7 @@ int main( int argc, char** argv) if (graph_args < 1) { Usage(); return 1; } if (graph_type == "market") { - // Matrix-market coordinate-formatted graph file - + // Matrix-market coordinate-formatted graph file char *market_filename = (graph_args == 2) ? argv[2] : NULL; if (graphio::BuildMarketGraph( market_filename, diff --git a/tests/vis/Makefile b/tests/vis/Makefile new file mode 100644 index 000000000..7931cd948 --- /dev/null +++ b/tests/vis/Makefile @@ -0,0 +1,108 @@ +# ----------------------------------------------------------------------------- +# Gunrock -- High-Performance Graph Primitives on GPU +# ----------------------------------------------------------------------------- +# This source code is distributed under the terms of LICENSE.TXT +# in the root directory of this source distribution. 
+# ----------------------------------------------------------------------------- +# Build script for project +# ----------------------------------------------------------------------------- + +force64 = 1 +NVCC = "$(shell which nvcc)" +NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//')) + +KERNELS = + +# detect OS +OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) + +# ----------------------------------------------------------------------------- +# Gen targets +# ----------------------------------------------------------------------------- + +GEN_SM35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\" +GEN_SM30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\" +SM_TARGETS = $(GEN_SM35) + +# ----------------------------------------------------------------------------- +# Libs +# ----------------------------------------------------------------------------- + + +# ----------------------------------------------------------------------------- +# Includes +# ----------------------------------------------------------------------------- + +CUDA_INC = "$(shell dirname $(NVCC))/../include" +MGPU_INC = "../../externals/moderngpu/include" +INC = -I$(CUDA_INC) -I$(MGPU_INC) -I.. -I../.. 
+ +# ----------------------------------------------------------------------------- +# Defines +# ----------------------------------------------------------------------------- + +DEFINES = + +# ----------------------------------------------------------------------------- +# Compiler Flags +# ----------------------------------------------------------------------------- + +ifneq ($(force64), 1) + # Compile with 32-bit device pointers (only when force64 is not 1) + ARCH_SUFFIX = i386 + ARCH = -m32 +else + ARCH_SUFFIX = x86_64 + ARCH = -m64 +endif + +NVCCFLAGS = -Xcudafe -\# + +ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) + NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500 +endif + + +ifeq ($(verbose), 1) + NVCCFLAGS += -v +endif + +ifeq ($(keep), 1) + NVCCFLAGS += -keep +endif + +ifdef maxregisters + NVCCFLAGS += -maxrregcount $(maxregisters) +endif + +# ----------------------------------------------------------------------------- +# Dependency Lists +# ----------------------------------------------------------------------------- + +DEPS = ./Makefile \ + $(wildcard ../../gunrock/util/*.cuh) \ + $(wildcard ../../gunrock/util/**/*.cuh) \ + $(wildcard ../../gunrock/*.cuh) \ + $(wildcard ../../gunrock/graphio/*.cuh) \ + $(wildcard ../../gunrock/oprtr/*.cuh) \ + $(wildcard ../../gunrock/oprtr/**/*.cuh) \ + $(wildcard ../../gunrock/app/*.cuh) \ + $(wildcard ../../gunrock/app/**/*.cuh) + +# ----------------------------------------------------------------------------- +# (make test) Test driver for Vertex-Induced Subgraph (test_vis.cu) +# ----------------------------------------------------------------------------- + +test: bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) + +bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) : test_vis.cu ../../gunrock/util/test_utils.cu ../../gunrock/util/error_utils.cu ../../externals/moderngpu/src/mgpucontext.cu ../../externals/moderngpu/src/mgpuutil.cpp $(DEPS) + mkdir -p bin + $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) test_vis.cu ../../gunrock/util/test_utils.cu 
../../gunrock/util/error_utils.cu ../../externals/moderngpu/src/mgpucontext.cu ../../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -O3 + +# ----------------------------------------------------------------------------- +# Clean +# ----------------------------------------------------------------------------- + +clean : + rm -f bin/*_$(NVCC_VERSION)_$(ARCH_SUFFIX)* + rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o diff --git a/tests/vis/run.sh b/tests/vis/run.sh new file mode 100644 index 000000000..708cedfec --- /dev/null +++ b/tests/vis/run.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# get all execution files in ./bin +files=(./bin/*) +# newest version seen so far ("$" sorts before every digit in ASCII) +max_ver_num="$" +# fall back to the first file when no name carries a version number +exe_file=${files[0]} +# iterate over all file names to get the largest version number +for x in "${files[@]}" +do + output=$(grep -o "[0-9]\.[0-9]" <<<"$x") + if [ "$output" \> "$max_ver_num" ]; then + exe_file=$x; max_ver_num=$output + fi +done + +# put OS and Device type here +SUFFIX="ubuntu12.04.k40c" + +mkdir -p eval/$SUFFIX + +for i in test_bc +do + echo $exe_file market ../../dataset/small/$i.mtx + $exe_file market ../../dataset/small/$i.mtx > eval/$SUFFIX/$i.$SUFFIX.txt + sleep 1 +done diff --git a/tests/vis/test_vis.cu b/tests/vis/test_vis.cu new file mode 100644 index 000000000..3584b9bff --- /dev/null +++ b/tests/vis/test_vis.cu @@ -0,0 +1,346 @@ +// ---------------------------------------------------------------------------- +// Gunrock -- High-Performance Graph Primitives on GPU +// ---------------------------------------------------------------------------- +// This source code is distributed under the terms of LICENSE.TXT +// in the root directory of this source distribution. 
+// ---------------------------------------------------------------------------- + +/** + * @file test_vis.cu + * @brief Simple test driver program for Vertex-Induced Subgraph + */ + +#include +#include +#include +#include +#include + +// utilities for correctness checking +#include + +// graph construction utilities +#include + +// primitive-specific headers +#include +#include +#include + +// gunrock abstraction graph operators +#include +#include + +#include + +using namespace gunrock; +using namespace gunrock::util; +using namespace gunrock::oprtr; +using namespace gunrock::app::vis; + +// ---------------------------------------------------------------------------- +// Defines, constants, globals +// ---------------------------------------------------------------------------- + +bool g_verbose; +bool g_undirected; +bool g_quick; +bool g_stream_from_host; + +// ---------------------------------------------------------------------------- +// Housekeeping Routines +// ---------------------------------------------------------------------------- +void Usage() { + printf( + " test_vis [--undirected] [--quick]\n" + " [--device=] [--instrumented] [--iteration-num=]\n" + " [--v] [--traversal-mode=<0|1>] [--queue-sizing=]\n" + "Graph types and arguments:\n" + " market \n" + " Reads a Matrix-Market coordinate-formatted graph,\n" + " edges from STDIN (or from the optionally-specified file)\n" + " --device= Set GPU device to run. 
[Default: 0]\n" + " --undirected Convert the graph to undirected\n" + " --instrumented Keep kernels statics [Default: Disable]\n" + " total_queued, search_depth and avg_duty\n" + " (a relative indicator of load imbalance)\n" + " --quick Skip the CPU validation [Default: false]\n" + " --queue-sizing= Allocates a frontier queue sized at: \n" + " (graph-edges * ) [Default: 1.0]\n" + " --v Print verbose per iteration debug info\n" + " --iteration-num= Number of tests to run [Default: 1]\n" + " --traversal-mode=<0 | 1> Set strategy, 0 for Load-Balanced,\n" + " 1 for Dynamic-Cooperative\n" + " [Default: according to topology]\n"); +} + +/** + * @brief Displays primitive result + * + * @tparam VertexId + * @tparam SizeT + * @tparam Value + */ +template +void DisplaySolution(const Csr &graph) { + // TODO: code to print out results +} + + +/** + * @brief Performance / Evaluation statistics + */ +struct Stats { + const char *name; + Statistic num_iterations; + Stats() : name(NULL), num_iterations() {} + explicit Stats(const char *name) : name(name), num_iterations() {} +}; + +/** + * @brief Displays timing and correctness statistics + * + * @tparam VertexId + * @tparam SizeT + * @tparam Value + * + * @param[in] stats Reference to the Stats object + * @param[in] graph Reference to the CSR graph we process on + */ +template +void DisplayStats(const Stats &stats, const Csr &graph, + const float elapsed, const long long iterations) { + printf("[%s] finished.\n", stats.name); + printf("elapsed: %.4f ms\n", elapsed); +} + +// ---------------------------------------------------------------------------- +// Testing Routines +// ---------------------------------------------------------------------------- + +/** + * @brief A simple CPU-based reference implementation. 
+ * + * @tparam VertexId + * @tparam SizeT + * @tparam Value + * + * @param[in] graph Reference to the CSR graph we process on + */ +template +void SimpleReference(const Csr &graph) { + // initialization + + // perform calculation + + CpuTimer cpu_timer; + cpu_timer.Start(); + + // TODO: CPU validation code here + + cpu_timer.Stop(); + + float cpu_elapsed = cpu_timer.ElapsedMillis(); + printf("CPU reference finished in %lf ms.\n\n", cpu_elapsed); +} + +/** + * @brief Sample test + * + * @tparam VertexId + * @tparam SizeT + * @tparam Value + * + * @param[in] graph Reference to the CSR graph we process on + * @param[in] max_grid_size Maximum CTA occupancy + * @param[in] num_gpus Number of GPUs + * @param[in] max_queue_sizing Scaling factor used in edge mapping + * @param[in] iterations Number of iterations for running the test + * @param[in] traversal_mode Strategy: Load-balanced or Dynamic cooperative + * @param[in] context CudaContext pointer for ModernGPU APIs + * + */ +template +void RunTest( + const Csr &graph, + int max_grid_size, + int num_gpus, + double max_queue_sizing, + int iterations, + int traversal_mode, + CudaContext& context) { + typedef VISProblem Problem; + + // allocate host-side array (for both reference and GPU-computed results) + VertexId *r_labels = (VertexId*)malloc(sizeof(VertexId) * graph.nodes); + VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * graph.nodes); + + // allocate primitive enactor map + VISEnactor enactor(g_verbose); + + // allocate primitive problem on GPU + Problem *csr_problem = new Problem; + util::GRError(csr_problem->Init( + g_stream_from_host, + graph, + num_gpus), + "Problem Initialization Failed", __FILE__, __LINE__); + + Stats *stats = new Stats("Vertex-Induced Subgraph"); + + // perform calculation + GpuTimer gpu_timer; + + float elapsed = 0.0f; + + for (int iter = 0; iter < iterations; ++iter) { + util::GRError( + csr_problem->Reset(enactor.GetFrontierType(), + max_queue_sizing), + "Problem Data Reset 
Failed", __FILE__, __LINE__); + gpu_timer.Start(); + util::GRError( + enactor.template Enact(context, csr_problem, + max_grid_size, traversal_mode), + "Problem Enact Failed", __FILE__, __LINE__); + gpu_timer.Stop(); + elapsed += gpu_timer.ElapsedMillis(); + } + + elapsed /= iterations; + + // extract results + util::GRError(csr_problem->Extract(h_labels), + "Problem Data Extraction Failed", __FILE__, __LINE__); + + // compute reference CPU validation solution + if (!g_quick) { + printf("-- computing reference value ... (currently missing)\n"); + SimpleReference(graph); + printf("-- validation: (currently missing)\n"); + } + + // display solution + DisplaySolution(graph); + + // display statistics + VertexId num_iteratios = 0; + enactor.GetStatistics(num_iteratios); + DisplayStats(*stats, graph, elapsed, num_iteratios); + + // clean up + delete stats; + if (csr_problem) delete csr_problem; + if (r_labels) free(r_labels); + if (h_labels) free(h_labels); + + cudaDeviceSynchronize(); +} + +/** + * @brief Test entry + * + * @tparam VertexId + * @tparam SizeT + * @tparam Value + * + * @param[in] graph Reference to the CSR graph we process on + * @param[in] args Reference to the command line arguments + * @param[in] context CudaContext pointer for ModernGPU APIs + */ +template +void RunTest( + Csr &graph, + CommandLineArgs &args, + CudaContext& context) { + bool instrumented = 0; // Collect instrumentation from kernels + int max_grid_size = 0; // Maximum grid size (0: up to the enactor) + int num_gpus = 1; // Number of GPUs for multi-GPU enactor + double max_queue_sizing = 1.0; // Maximum scaling factor for work queues + int iterations = 1; // Number of runs for testing + int traversal_mode = -1; // Load-balanced or Dynamic cooperative + g_quick = 0; // Whether or not to skip CPU validation + + // choose traversal mode + args.GetCmdLineArgument("traversal-mode", traversal_mode); + if (traversal_mode == -1) { + traversal_mode = graph.GetAverageDegree() > 8 ? 
0 : 1; + } + + g_verbose = args.CheckCmdLineFlag("v"); + instrumented = args.CheckCmdLineFlag("instrumented"); + g_quick = args.CheckCmdLineFlag("quick"); + + args.GetCmdLineArgument("iteration-num", iterations); + args.GetCmdLineArgument("grid-size", max_grid_size); + args.GetCmdLineArgument("queue-sizing", max_queue_sizing); + + if (instrumented) { + RunTest( + graph, + max_grid_size, + num_gpus, + max_queue_sizing, + iterations, + traversal_mode, + context); + } else { + RunTest( + graph, + max_grid_size, + num_gpus, + max_queue_sizing, + iterations, + traversal_mode, + context); + } +} + +// ---------------------------------------------------------------------------- +// Main +// ---------------------------------------------------------------------------- +int main(int argc, char** argv) { + CommandLineArgs args(argc, argv); + if ((argc < 2) || (args.CheckCmdLineFlag("help"))) { + Usage(); + return 1; + } + + int device = 0; + args.GetCmdLineArgument("device", device); + ContextPtr context = mgpu::CreateCudaDevice(device); + + // parse graph-construction parameters + g_undirected = args.CheckCmdLineFlag("undirected"); + + std::string graph_type = argv[1]; + int flags = args.ParsedArgc(); + int graph_args = argc - flags - 1; + if (graph_args < 1) { + Usage(); + return 1; + } + + typedef int VertexId; // Use as the vertex identifier + typedef int SizeT; // Use as the graph size type + typedef int Value; // Use as the value type + + if (graph_type == "market") { + // matrix-market coordinate-formatted graph + Csr csr(false); + char *name = (graph_args == 2) ? argv[2] : NULL; + if (graphio::BuildMarketGraph( + name, csr, g_undirected, false) != 0) { + return 1; + } + + csr.DisplayGraph(); // display graph adjacent list + csr.PrintHistogram(); // display graph histogram + RunTest(csr, args, *context); // run sample test + + } else { + fprintf(stderr, "Unspecified graph type\n"); + return 1; + } + return 0; +}