From 6ba1b57ac458639dd4d3639feb3d5fabc4cfacdf Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Tue, 2 Jun 2015 16:11:56 -0700
Subject: [PATCH 01/36] small makefile fix for simple_example

---
 simple_example/Makefile | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/simple_example/Makefile b/simple_example/Makefile
index 9762bce03..842ba0e19 100644
--- a/simple_example/Makefile
+++ b/simple_example/Makefile
@@ -94,7 +94,7 @@ else
 	ARCH = -m64
 endif
 
-NVCCFLAGS = -Xptxas -v -Xcudafe -\#
+NVCCFLAGS = -Xptxas -v -Xcudafe -\# -lineinfo
 
 ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
 	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
@@ -123,15 +123,15 @@ endif
 # Dependency Lists
 #-------------------------------------------------------------------------------
 
-DEPS =			./Makefile \
-				$(wildcard ../gunrock/util/*.cuh) \
-				$(wildcard ../gunrock/util/**/*.cuh) \
-				$(wildcard ../gunrock/*.cuh) \
-				$(wildcard ../gunrock/graphio/*.cuh) \
-				$(wildcard ../gunrock/oprtr/*.cuh) \
-				$(wildcard ../gunrock/oprtr/**/*.cuh) \
-				$(wildcard ../gunrock/app/*.cuh) \
-				$(wildcard ../gunrock/app/**/*.cuh)
+DEPS = ./Makefile \
+	$(wildcard ../gunrock/util/*.cuh) \
+	(wildcard ../gunrock/util/**/*.cuh) \
+	$(wildcard ../gunrock/*.cuh) \
+	$(wildcard ../gunrock/graphio/*.cuh) \
+	$(wildcard ../gunrock/oprtr/*.cuh) \
+	$(wildcard ../gunrock/oprtr/**/*.cuh) \
+	$(wildcard ../gunrock/app/*.cuh) \
+	$(wildcard ../gunrock/app/**/*.cuh)
 
 #-------------------------------------------------------------------------------
 # (make simple) Simple example driver for three primitives: CC, BFS and BC
@@ -139,9 +139,9 @@ DEPS =			./Makefile \
 
 simple: bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX)
 
-bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
+bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
 	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3
 
 #-------------------------------------------------------------------------------
 # Clean

From e01e65c5f9c5ead89d5dc8df5e47c64c7648379a Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Tue, 9 Jun 2015 09:54:57 -0700
Subject: [PATCH 02/36] fix a small bug due to missing one condition check

---
 gunrock/app/mst/mst_functor.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gunrock/app/mst/mst_functor.cuh b/gunrock/app/mst/mst_functor.cuh
index 8cf90fe0e..1f5393571 100644
--- a/gunrock/app/mst/mst_functor.cuh
+++ b/gunrock/app/mst/mst_functor.cuh
@@ -111,7 +111,8 @@ struct EdgeFunctor
     VertexId s_id, VertexId d_id, DataSlice *problem,
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
-    return problem->d_successors[s_id] == d_id;
+    return problem->d_successors[s_id] == d_id &&
+      problem->d_reduced_vals[s_id] == problem->d_edge_weights[e_id];
   }
 
   /**

From 9391686be26eb892794cab2760da52e471034d5d Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Tue, 9 Jun 2015 22:27:02 -0700
Subject: [PATCH 03/36] use random value if no input weight values

---
 gunrock/graphio/market.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh
index 3795d3bca..494200f4b 100644
--- a/gunrock/graphio/market.cuh
+++ b/gunrock/graphio/market.cuh
@@ -141,7 +141,7 @@ int ReadMarketStream(
                     if (coo) free(coo);
                     return -1;
                 } else if (num_input == 2) {
-                    ll_value = 1;
+                    ll_value = rand() % 64;
                 }
             } else {
                 if (sscanf(line, "%lld %lld", &ll_col, &ll_row) != 2) {

From a560c88260b74672c3f26f938facb653e477af7a Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 10 Jun 2015 08:31:45 -0700
Subject: [PATCH 04/36] oops missing one $ sign in makefile

---
 simple_example/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simple_example/Makefile b/simple_example/Makefile
index 842ba0e19..5fade4d66 100644
--- a/simple_example/Makefile
+++ b/simple_example/Makefile
@@ -125,7 +125,7 @@ endif
 
 DEPS = ./Makefile \
 	$(wildcard ../gunrock/util/*.cuh) \
-	(wildcard ../gunrock/util/**/*.cuh) \
+	$(wildcard ../gunrock/util/**/*.cuh) \
 	$(wildcard ../gunrock/*.cuh) \
 	$(wildcard ../gunrock/graphio/*.cuh) \
 	$(wildcard ../gunrock/oprtr/*.cuh) \

From 329f4e7521614e8f47db8648f408be7574d71f52 Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yzhwang@ucdavis.edu>
Date: Thu, 11 Jun 2015 13:02:40 -0700
Subject: [PATCH 05/36] changed the BC forward phase cache offset index to
 stack. Still have bugs for single node BC.

---
 gunrock/app/bc/bc_enactor.cuh | 42 +++++++++++++++++------------------
 gunrock/app/bc/bc_functor.cuh | 12 +++++-----
 gunrock/util/test_utils.h     |  1 +
 3 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh
index 49526a5f2..2a0c621f7 100644
--- a/gunrock/app/bc/bc_enactor.cuh
+++ b/gunrock/app/bc/bc_enactor.cuh
@@ -242,8 +242,8 @@ class BCEnactor : public EnactorBase
 
             frontier_attribute.queue_reset          = true;
 
-            std::vector<SizeT> forward_queue_offsets(graph_slice->nodes);
-            forward_queue_offsets.push_back(0);
+            std::stack<SizeT> forward_queue_offsets;
+            forward_queue_offsets.push(0);
 
             if (AdvanceKernelPolicy::ADVANCE_MODE == gunrock::oprtr::advance::LB) {
                 if (retval = util::GRError(cudaMalloc(
@@ -259,12 +259,12 @@ class BCEnactor : public EnactorBase
             while (done[0] < 0) {
 
                 if (frontier_attribute.queue_length > 0 && enactor_stats.iteration > 0) {
-                    SizeT cur_offset = forward_queue_offsets.back();
+                    SizeT cur_offset = forward_queue_offsets.top();
                     //printf("offset:%d, current length:%d\n", cur_offset, frontier_attribute.queue_length);
                     util::MemsetCopyVectorKernel<<<128, 128>>>(&problem->data_slices[0]->d_forward_output[cur_offset], graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
                     //util::DisplayDeviceResults(graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
                     //util::DisplayDeviceResults(&problem->data_slices[0]->d_forward_output[cur_offset], frontier_attribute.queue_length);
-                    forward_queue_offsets.push_back(frontier_attribute.queue_length+cur_offset);
+                    forward_queue_offsets.push(frontier_attribute.queue_length+cur_offset);
                 }
 
                 // Edge Map
@@ -351,15 +351,12 @@ class BCEnactor : public EnactorBase
                 if (/*DEBUG &&*/ (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
                 cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates
 
-
                 frontier_attribute.queue_index++;
                 frontier_attribute.selector ^= 1;
-                enactor_stats.iteration++;
 
                 if (AdvanceKernelPolicy::ADVANCE_MODE == gunrock::oprtr::advance::LB) {
                     if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
                     } 
-
                 if (INSTRUMENT || DEBUG) {
                     if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
                     if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length);
@@ -373,6 +370,8 @@ class BCEnactor : public EnactorBase
                 // Check if done
                 if (done[0] == 0) break;
 
+                enactor_stats.iteration++;
+
                 if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration);
 
             }
@@ -396,8 +395,12 @@ class BCEnactor : public EnactorBase
 
             if (DEBUG) printf("\nStart backward phase\n%lld", (long long) enactor_stats.iteration);
             // Backward BC iteration
-            for (int iter = forward_queue_offsets.size()-3; iter >=0; --iter) {
-                frontier_attribute.queue_length = forward_queue_offsets[iter+1]-forward_queue_offsets[iter];  
+            SizeT top_offset = forward_queue_offsets.top();
+            if (DEBUG) printf("top offsets:%d\n", top_offset);
+            forward_queue_offsets.pop();
+            while (!forward_queue_offsets.empty()) {
+                frontier_attribute.queue_length = top_offset-forward_queue_offsets.top();
+                printf("queue length:%d\n", frontier_attribute.queue_length);
                 /*frontier_attribute.queue_length        = graph_slice->nodes;
                 // Fill in the frontier_queues
                 util::MemsetIdxKernel<<<128, 128>>>(graph_slice->frontier_queues.d_keys[0], graph_slice->nodes);
@@ -460,7 +463,7 @@ class BCEnactor : public EnactorBase
                 // Check if done
                 if (done[0] == 0) break;*/
                 // Edge Map
-                if (iter > 0) {
+                if (forward_queue_offsets.top() > 0) {
                     gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, BCProblem, BackwardFunctor>(
                             d_done,
                             enactor_stats,
@@ -470,8 +473,8 @@ class BCEnactor : public EnactorBase
                             (bool*)NULL,
                             (bool*)NULL,
                             d_scanned_edges,
-                            &problem->data_slices[0]->d_forward_output[forward_queue_offsets[iter]],              // d_in_queue
-                            graph_slice->frontier_queues.d_keys[0],            // d_out_queue
+                            &problem->data_slices[0]->d_forward_output[forward_queue_offsets.top()],              // d_in_queue
+                            NULL,            // d_out_queue
                             (VertexId*)NULL,
                             (VertexId*)NULL,
                             graph_slice->d_row_offsets,
@@ -493,8 +496,8 @@ class BCEnactor : public EnactorBase
                             (bool*)NULL,
                             (bool*)NULL,
                             d_scanned_edges,
-                            &problem->data_slices[0]->d_forward_output[forward_queue_offsets[iter]],              // d_in_queue
-                            graph_slice->frontier_queues.d_keys[0],            // d_out_queue
+                            &problem->data_slices[0]->d_forward_output[0],              // d_in_queue
+                            NULL,            // d_out_queue
                             (VertexId*)NULL,
                             (VertexId*)NULL,
                             graph_slice->d_row_offsets,
@@ -508,7 +511,7 @@ class BCEnactor : public EnactorBase
                             gunrock::oprtr::advance::V2V);
                 }
 
-                if (/*DEBUG &&*/ (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
+                if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
                 cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates
 
                 //frontier_attribute.queue_index++;
@@ -518,7 +521,6 @@ class BCEnactor : public EnactorBase
 
                 if (INSTRUMENT || DEBUG) {
                     if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
-                    if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length);
                     if (INSTRUMENT) {
                         if (retval = enactor_stats.filter_kernel_stats.Accumulate(
                             enactor_stats.filter_grid_size,
@@ -526,11 +528,9 @@ class BCEnactor : public EnactorBase
                             enactor_stats.total_lifetimes)) break;
                     }
                 }
-                // Check if done
-                if (done[0] == 0) break;
-
-                if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration-1);
-
+                top_offset = forward_queue_offsets.top();
+                forward_queue_offsets.pop();
+                if (DEBUG) printf("top offsets:%d\n", top_offset);
             }
             if (retval) break;
 
diff --git a/gunrock/app/bc/bc_functor.cuh b/gunrock/app/bc/bc_functor.cuh
index 4e0fdecfb..6b8e594d6 100644
--- a/gunrock/app/bc/bc_functor.cuh
+++ b/gunrock/app/bc/bc_functor.cuh
@@ -204,12 +204,12 @@ struct BackwardFunctor
         //Accumulate delta value
 
         //Accumulate bc value
-        //atomicAdd(&problem->d_ebc_values[e_id], result);
+        atomicAdd(&problem->d_ebc_values[e_id], result);
 
         if (s_id != problem->d_src_node[0]) {
             atomicAdd(&problem->d_deltas[s_id], result);
             atomicAdd(&problem->d_bc_values[s_id], result);
-        }
+       }
     }
 
     /**
@@ -305,17 +305,17 @@ struct BackwardFunctor2
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
             to_delta, problem->d_deltas + d_id);
 
-        //Value result = from_sigma / to_sigma * (1.0 + to_delta);
+        Value result = from_sigma / to_sigma * (1.0 + to_delta);
 
         //Accumulate delta value
 
         //Accumulate bc value
-        //atomicAdd(&problem->d_ebc_values[e_id], result);
+        atomicAdd(&problem->d_ebc_values[e_id], result);
 
-        /*if (s_id != problem->d_src_node[0]) {
+        if (s_id != problem->d_src_node[0]) {
             atomicAdd(&problem->d_deltas[s_id], result);
             atomicAdd(&problem->d_bc_values[s_id], result);
-        }*/
+        }
     }
 
     /**
diff --git a/gunrock/util/test_utils.h b/gunrock/util/test_utils.h
index b5e291587..c33d27d00 100644
--- a/gunrock/util/test_utils.h
+++ b/gunrock/util/test_utils.h
@@ -29,6 +29,7 @@
 #include <map>
 #include <string>
 #include <vector>
+#include <stack>
 #include <sstream>
 #include <iostream>
 #include <fstream>

From 01f997c474b4c95758fd027c625aae82e438ca43 Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yzhwang@ucdavis.edu>
Date: Fri, 12 Jun 2015 12:42:04 -0700
Subject: [PATCH 06/36] debugging bc bug. Added DisplayDeviceResults with given
 indices list.

---
 gunrock/app/bc/bc_enactor.cuh |  3 +--
 gunrock/util/test_utils.cuh   | 38 +++++++++++++++++++++++++++++++++++
 gunrock/util/test_utils.h     |  1 +
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh
index 2a0c621f7..24b4abe99 100644
--- a/gunrock/app/bc/bc_enactor.cuh
+++ b/gunrock/app/bc/bc_enactor.cuh
@@ -263,7 +263,6 @@ class BCEnactor : public EnactorBase
                     //printf("offset:%d, current length:%d\n", cur_offset, frontier_attribute.queue_length);
                     util::MemsetCopyVectorKernel<<<128, 128>>>(&problem->data_slices[0]->d_forward_output[cur_offset], graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
                     //util::DisplayDeviceResults(graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
-                    //util::DisplayDeviceResults(&problem->data_slices[0]->d_forward_output[cur_offset], frontier_attribute.queue_length);
                     forward_queue_offsets.push(frontier_attribute.queue_length+cur_offset);
                 }
 
@@ -400,7 +399,7 @@ class BCEnactor : public EnactorBase
             forward_queue_offsets.pop();
             while (!forward_queue_offsets.empty()) {
                 frontier_attribute.queue_length = top_offset-forward_queue_offsets.top();
-                printf("queue length:%d\n", frontier_attribute.queue_length);
+                util::DisplayDeviceResults(problem->data_slices[0]->d_sigmas, &problem->data_slices[0]->d_forward_output[forward_queue_offsets.top()], graph_slice->nodes, frontier_attribute.queue_length);
                 /*frontier_attribute.queue_length        = graph_slice->nodes;
                 // Fill in the frontier_queues
                 util::MemsetIdxKernel<<<128, 128>>>(graph_slice->frontier_queues.d_keys[0], graph_slice->nodes);
diff --git a/gunrock/util/test_utils.cuh b/gunrock/util/test_utils.cuh
index 91ca03c7e..93f223642 100644
--- a/gunrock/util/test_utils.cuh
+++ b/gunrock/util/test_utils.cuh
@@ -248,6 +248,44 @@ void DisplayDeviceResults(
     if (h_data) free(h_data);
 }
 
+/**
+ * Display the entries of a device array selected by a device index list:
+ * prints "index:value" for each of the num_indices entries of d_indices.
+ */
+template <typename DATATYPE, typename INDEXTYPE>
+void DisplayDeviceResults(
+    DATATYPE *d_data,
+    INDEXTYPE *d_indices,
+    size_t num_elements,
+    size_t num_indices)
+{
+    printf("num_elements:%llu\n", (unsigned long long) num_elements);
+    printf("num_indices:%llu\n", (unsigned long long) num_indices);
+    // Allocate host copies of both arrays
+    DATATYPE *h_data = (DATATYPE*) malloc(num_elements * sizeof(DATATYPE));
+    INDEXTYPE *h_indices = (INDEXTYPE*) malloc(num_indices * sizeof(INDEXTYPE));
+    // Copy back; report a failed allocation/copy instead of printing garbage
+    if (!h_data || !h_indices ||
+        cudaMemcpy(h_data, d_data, sizeof(DATATYPE) * num_elements, cudaMemcpyDeviceToHost) != cudaSuccess ||
+        cudaMemcpy(h_indices, d_indices, sizeof(INDEXTYPE) * num_indices, cudaMemcpyDeviceToHost) != cudaSuccess)
+        printf("DisplayDeviceResults: host allocation or device-to-host copy failed\n");
+    else
+    {
+        printf("\n\nData:\n");
+        for (size_t i = 0; i < num_indices; i++)
+        {
+            PrintValue(h_indices[i]);
+            printf(":");
+            assert((size_t) h_indices[i] < num_elements);  // negative ids wrap and fail too
+            PrintValue(h_data[h_indices[i]]);
+            printf(", ");
+        }
+        printf("\n\n");
+    }
+    free(h_data);  // free(NULL) is a no-op, no guard needed
+    free(h_indices);
+}
+
 /******************************************************************************
  * Timing
  ******************************************************************************/
diff --git a/gunrock/util/test_utils.h b/gunrock/util/test_utils.h
index c33d27d00..8c5e9d573 100644
--- a/gunrock/util/test_utils.h
+++ b/gunrock/util/test_utils.h
@@ -26,6 +26,7 @@
 #include <math.h>
 #include <float.h>
 
+#include <cassert>
 #include <map>
 #include <string>
 #include <vector>

From 1d2ff81fdb1515b44bf7033b7076e2a32f33ebf7 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Mon, 15 Jun 2015 12:15:12 -0700
Subject: [PATCH 07/36] add graph connectivity test before running mst

---
 tests/mst/test_mst.cu | 44 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index 6083d9bd5..4e295ad56 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -28,6 +28,7 @@
 #include <gunrock/graphio/market.cuh>
 
 // MST includes
+#include <gunrock/app/cc/cc_app.cu>
 #include <gunrock/app/mst/mst_enactor.cuh>
 #include <gunrock/app/mst/mst_problem.cuh>
 #include <gunrock/app/mst/mst_functor.cuh>
@@ -83,6 +84,11 @@ void Usage()
 /**
  * @brief Displays the MST result
  *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[in] graph reference to the CSR graph we process on
  */
 ////////////////////////////////////////////////////////////////////////////////
 template<typename VertexId, typename Value, typename SizeT>
@@ -122,6 +128,26 @@ void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph, int *mst_output)
   if (source) { delete [] source; }
 }
 
+/**
+ * @brief A simple connectivity check utility
+ *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[in] graph reference to the CSR graph we process on
+ */
+template<typename VertexId, typename Value, typename SizeT>
+bool IsConnected(const Csr<VertexId, Value, SizeT> & graph)
+{
+  // malloc output graph
+  GunrockGraph *graph_output =
+    (GunrockGraph*)malloc(sizeof(GunrockGraph));
+  unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
+  run_cc<int, int, int>(graph_output, components, graph, 0, 1);
+  return *components == 1;
+}
+
 /**
  * @brief A simple CPU-based reference MST implementation.
  *
@@ -214,10 +240,10 @@ long long int SimpleReferenceMST(
 ////////////////////////////////////////////////////////////////////////////////
 template <typename VertexId, typename Value, typename SizeT, bool INSTRUMENT>
 void RunTests(
-  const Csr<VertexId, Value, SizeT> &graph,
+  const Csr<VertexId, Value, SizeT> & graph,
   int max_grid_size,
   int num_gpus,
-  mgpu::CudaContext& context)
+  mgpu::CudaContext & context)
 {
   printf("\nMINIMUM SPANNING TREE TEST\n");
 
@@ -411,13 +437,19 @@ int main(int argc, char** argv)
     * weight per edge. Note it only support FULLY-CONNECTED graphs *
     ***************************************************************/
 
-    // run GPU tests
-    RunTests(csr, args, *context);
-
+    // test graph connectivity and run test
+    if (IsConnected(csr))
+    {
+        RunTests(csr, args, *context);
+    }
+    else
+    {
+      fprintf(stderr, "Unsupported non-fully connected graph input.\n");
+    }
   }
   else
   {
-    fprintf(stderr, "Unspecified graph type\n");
+    fprintf(stderr, "Unspecified graph type.\n");
     return 1;
   }
 

From dd22c01038b47668a03f1c174864e3c52134f222 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Mon, 15 Jun 2015 13:22:31 -0700
Subject: [PATCH 08/36] template type minor change

---
 tests/mst/test_mst.cu | 45 +++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index 4e295ad56..afefdab23 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -140,11 +140,10 @@ void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph, int *mst_output)
 template<typename VertexId, typename Value, typename SizeT>
 bool IsConnected(const Csr<VertexId, Value, SizeT> & graph)
 {
-  // malloc output graph
-  GunrockGraph *graph_output =
-    (GunrockGraph*)malloc(sizeof(GunrockGraph));
+  GunrockGraph *temp = (GunrockGraph*)malloc(sizeof(GunrockGraph));
   unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
-  run_cc<int, int, int>(graph_output, components, graph, 0, 1);
+  run_cc<VertexId, Value, SizeT>(temp, components, graph, 0, 1);
+  if (temp) free(temp);
   return *components == 1;
 }
 
@@ -269,7 +268,7 @@ void RunTests(
     "MST Problem Data Reset Failed", __FILE__, __LINE__);
 
   // perform MST
-  GpuTimer gpu_timer; // record the kernel running time
+  GpuTimer gpu_timer;  // record the kernel running time
 
   gpu_timer.Start();
 
@@ -287,7 +286,7 @@ void RunTests(
   util::GRError(mst_problem->Extract(h_mst_output),
     "MST Problem Data Extraction Failed", __FILE__, __LINE__);
 
-  if (!g_quick) // run CPU reference test
+  if (!g_quick)  // run CPU reference test
   {
     // calculate GPU final number of selected edges
     int num_selected_gpu = 0;
@@ -342,17 +341,16 @@ void RunTests(
  */
 template <typename VertexId, typename Value, typename SizeT>
 void RunTests(
-  const Csr<VertexId, Value, SizeT> &graph,
-  CommandLineArgs                   &args,
-  mgpu::CudaContext&                context)
+  const Csr<VertexId, Value, SizeT> & graph,
+  CommandLineArgs                   & args,
+  mgpu::CudaContext                 & context)
 {
-  bool instrumented  = false; // do not collect instrumentation from kernels
-  int  max_grid_size = 0;     // maximum grid size (up to the enactor)
-  int  num_gpus      = 1;     // number of GPUs for multi-gpu enactor to use
-  g_quick            = false; // Whether or not to skip ref validation
+  bool instrumented  = 0;  // do not collect instrumentation from kernels
+  int  max_grid_size = 0;  // maximum grid size (up to the enactor)
+  int  num_gpus      = 1;  // number of GPUs for multi-gpu enactor to use
+  g_quick            = 0;  // Whether or not to skip ref validation
 
   instrumented = args.CheckCmdLineFlag("instrumented");
-
   g_quick = args.CheckCmdLineFlag("quick");
   g_verbose = args.CheckCmdLineFlag("v");
 
@@ -405,9 +403,9 @@ int main(int argc, char** argv)
 
     // matrix-market coordinate-formatted graph file
 
-    typedef int VertexId; // use as the vertex identifier type
-    typedef int Value;    // use as the value type
-    typedef int SizeT;    // use as the graph size type
+    typedef int VertexId;  // use as the vertex identifier type
+    typedef int Value;     // use as the value type
+    typedef int SizeT;     // use as the graph size type
 
     // default value for stream_from_host is false
     if (graph_args < 1)
@@ -427,17 +425,14 @@ int main(int argc, char** argv)
       g_undirected,
       false) != 0) { return 1; }
 
-    // display graph
+    // display input graph
     // csr.DisplayGraph();
 
-    /***************************************************************
-    * To make sure two graphs have same weight value for each edge *
-    * we have to change ll_value = rand()%64 in market.cuh file to *
-    * some NON-RANDOM value if the original graph does NOT contain *
-    * weight per edge. Note it only support FULLY-CONNECTED graphs *
-    ***************************************************************/
+    /**************************************************************************
+     * Note: Minimum Spanning Tree only supports undirected, connected graphs *
+     **************************************************************************/
 
-    // test graph connectivity and run test
+    // test graph connectivity
     if (IsConnected(csr))
     {
         RunTests(csr, args, *context);

From 098df4fe0365efffb243aa5c4af2a96d517d98fb Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Tue, 16 Jun 2015 11:24:17 -0700
Subject: [PATCH 09/36] added test script

---
 tests/mst/run.sh | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/mst/run.sh

diff --git a/tests/mst/run.sh b/tests/mst/run.sh
new file mode 100644
index 000000000..694a12fc0
--- /dev/null
+++ b/tests/mst/run.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+OPTION="--quick"
+
+# --quick skips the CPU reference algorithm. To validate against the CPU
+# reference, remove $OPTION from the command lines below. Warning: for
+# large data this can take a long time.
+
+# Pick the executable in ./bin whose name carries the highest X.Y version
+# (single-digit major/minor, compared as strings). Uses bash here-strings.
+max_ver_num=""
+exe_file=""
+
+# iterate over all file names to get the largest version number
+for x in ./bin/*
+do
+    [ -f "$x" ] || continue
+    ver=$(grep -o "[0-9]\.[0-9]" <<<"$x" | head -n 1)
+    if [ -z "$exe_file" ] || [ "$ver" \> "$max_ver_num" ]; then
+        max_ver_num=$ver
+        exe_file=$x
+    fi
+done
+
+[ -n "$exe_file" ] || { echo "no executable found in ./bin" >&2; exit 1; }
+
+# put OS and Device here
+SUFFIX="ubuntu12.04.k40c"
+
+mkdir -p eval/$SUFFIX
+for i in belgium_osm coAuthorsDBLP delaunay_n13 delaunay_n21
+do
+    echo $exe_file market ../../dataset/large/$i/$i.mtx $OPTION
+         $exe_file market ../../dataset/large/$i/$i.mtx $OPTION > eval/$SUFFIX/$i.$SUFFIX.txt
+    sleep 1
+done

From 54f5673b1a68231da901bb40cc9405ddb125aef9 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 11:09:39 -0700
Subject: [PATCH 10/36] data type changes, MST now supports float/double weight
 values

---
 gunrock/app/mst/mst_enactor.cuh | 299 +++-----------------------------
 gunrock/app/mst/mst_functor.cuh |  27 +--
 gunrock/app/mst/mst_problem.cuh |  69 ++++----
 gunrock/util/select_utils.cuh   | 216 ++++++++++-------------
 tests/mst/test_mst.cu           |  49 +++---
 5 files changed, 183 insertions(+), 477 deletions(-)

diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh
index 7d7d7a0a1..4908b71b9 100644
--- a/gunrock/app/mst/mst_enactor.cuh
+++ b/gunrock/app/mst/mst_enactor.cuh
@@ -297,9 +297,9 @@ public:
           problem->data_slices[0]->d_keys_array,
           problem->data_slices[0]->d_edge_weights,
           graph_slice->edges,
-          std::numeric_limits<int>::max(),
-          mgpu::minimum<int>(),
-          mgpu::equal_to<int>(),
+          std::numeric_limits<Value>::max(),
+          mgpu::minimum<Value>(),
+          mgpu::equal_to<Value>(),
           problem->data_slices[0]->d_reduced_keys,
           problem->data_slices[0]->d_reduced_vals,
           &num_segments, (int*)0, context);
@@ -341,8 +341,8 @@ public:
         util::MemsetKernel<<<128, 128>>>(problem->data_slices[0]->d_successors,
           std::numeric_limits<int>::max(), graph_slice->nodes);
         util::MemsetKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
-          std::numeric_limits<int>::max(), graph_slice->nodes);
+          problem->data_slices[0]->d_temp_index,
+          std::numeric_limits<VertexId>::max(), graph_slice->nodes);
         util::MemsetIdxKernel<<<128, 128>>>(
           graph_slice->frontier_queues.d_keys[frontier_attribute.selector],
           graph_slice->nodes);
@@ -680,41 +680,41 @@ public:
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_col_indices
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_col_indices,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_col_indices, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_edge_weights
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_value,
           problem->data_slices[0]->d_edge_weights,
           graph_slice->edges);
-        util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+        util::CUBSelect<Value, SizeT>(
+          problem->data_slices[0]->d_temp_value, graph_slice->edges,
           problem->data_slices[0]->d_edge_weights, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_keys_array
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_keys_array, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_origin_edges
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_origin_edges,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_origin_edges, num_selected);
 
         if (DEBUG) printf("  * finished remove edges in one super-vertex.\n");
@@ -785,12 +785,13 @@ public:
         ////////////////////////////////////////////////////////////////////////
         // bring edges, weights, origin_eids together according to keys
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_tmp_storage,
+          //problem->data_slices[0]->d_temp_value,
+            problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
@@ -801,276 +802,16 @@ public:
 
         util::CUBRadixSort<VertexId, Value>(
           true, graph_slice->edges,
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_edge_weights);
 
         util::CUBRadixSort<VertexId, VertexId>(
           true, graph_slice->edges,
-          problem->data_slices[0]->d_tmp_storage,
+          //problem->data_slices[0]->d_temp_value,
+          problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_origin_edges);
 
         if (DEBUG) printf("  * finished sort according to new vertex ids.\n");
-
-        /*
-        ////////////////////////////////////////////////////////////////////////
-        // remove duplicated edges between super-vertices (optional operation)
-        if (false)//(enactor_stats.iteration == 0)
-        {
-          //////////////////////////////////////////////////////////////////////
-          // generate edge flag array based on source vertices list [1]
-          // using MarkSegmentFromKeys on d_keys_array
-          util::MemsetKernel<unsigned int><<<128, 128>>>(
-            problem->data_slices[0]->d_flags_array, 0, graph_slice->edges);
-          util::MarkSegmentFromKeys<<<128, 128>>>(
-            problem->data_slices[0]->d_flags_array,
-            problem->data_slices[0]->d_keys_array,
-            graph_slice->edges);
-
-          if (debug_info)
-          {
-            printf(":: mark segment to generate edge flag array [1] ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_flags_array, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // generate edge flag array based on destination vertices list [2]
-          // create a flags array on the output of segmented sort based on the
-          // difference in u-v pair using MarkSegmentsFromKeys kernel function
-          util::MarkSegmentFromKeys<<<128, 128>>>(
-            problem->data_slices[0]->d_edge_flags,
-            problem->data_slices[0]->d_col_indices,
-            graph_slice->edges);
-
-          if (debug_info)
-          {
-            printf(":: mark segment to generate edge flag array [2] ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_flags, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // do or operation for d_edge_flags and d_flags_array - u-v pair
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, OrFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          if (DEBUG) printf("  * finished edge flags - second edge removal.\n");
-
-          if (debug_info)
-          {
-            printf(":: duplicated edges between super-vertex d_edge_flags ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_flags, graph_slice->edges);
-            printf(":: edge removal u list (d_keys_array) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_keys_array, graph_slice->edges);
-            printf(":: edge removal v list (d_col_indices) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_col_indices, graph_slice->edges);
-            printf(":: edge removal w list (d_edge_weights) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_weights, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // scan edge_flags to get edge_keys used for sorting
-          Scan<MgpuScanTypeInc>(
-            (int*)problem->data_slices[0]->d_edge_flags, graph_slice->edges,
-            (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
-            (int*)problem->data_slices[0]->d_temp_storage, context);
-
-          // set first bit of edge_flags back to 1
-          util::MemsetKernel<unsigned int><<<1, 1>>>(
-            problem->data_slices[0]->d_edge_flags, 1, 1);
-
-          //////////////////////////////////////////////////////////////////////
-          // calculate the number of segments for edge_offsets
-          num_segments = Reduce(
-            problem->data_slices[0]->d_edge_flags, graph_slice->edges, context);
-
-          //////////////////////////////////////////////////////////////////////
-          // generate edge_offsets used for SegSortFromIndices
-          // edge_flags stored in d_row_offsets
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, EIdxFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          //////////////////////////////////////////////////////////////////////
-          // segmented sort d_col_indices, d_edge_weights and d_origin_edges
-          // copy d_edge_weights to d_temp_storage to use for segmented sort
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_edge_weights,
-            graph_slice->edges);
-
-          util::SegSortFromIndices<SizeT, VertexId, Value>(
-            context,
-            num_segments,
-            problem->data_slices[0]->d_row_offsets,
-            graph_slice->edges,
-            problem->data_slices[0]->d_edge_weights,
-            problem->data_slices[0]->d_col_indices);
-
-          util::SegSortFromIndices<SizeT, VertexId, VertexId>(
-            context,
-            num_segments,
-            problem->data_slices[0]->d_row_offsets,
-            graph_slice->edges,
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_origin_edges);
-
-          if (DEBUG) printf("  * finished segmentedSort for edge reduction.\n");
-
-          if (debug_info)
-          {
-            printf(":: second reduction segmented sort d_col_indices ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_col_indices, graph_slice->edges);
-            printf(":: second reduction segmented sort d_edge_weights ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_weights, graph_slice->edges);
-            printf(":: second reduction segmented sort d_origin_edges ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_origin_edges, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // mark -1 to edges that needed to be removed using advance kernel
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, SuRmFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          if (DEBUG) printf("  * finished mark -1 for duplicated edges.\n");
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_col_indices
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_col_indices,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_col_indices,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_edge_weights
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_edge_weights,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_edge_weights,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_keys_array
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_keys_array,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_keys_array,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_origin_edges
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_origin_edges,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_origin_edges,
-            num_selected);
-
-          if (DEBUG)
-            printf("  * finished remove edges between super-vertices.\n");
-
-          graph_slice->edges = *num_selected;
-
-          if (DEBUG)
-            printf("  * finished update #edges: %d [2]\n", graph_slice->edges);
-
-        } // end of removing duplicated edges between super-vertices
-        */
-
         if (DEBUG) printf(" (d). Constructing the Vertex List.\n");
 
         ////////////////////////////////////////////////////////////////////////
diff --git a/gunrock/app/mst/mst_functor.cuh b/gunrock/app/mst/mst_functor.cuh
index 1f5393571..7f38e5d31 100644
--- a/gunrock/app/mst/mst_functor.cuh
+++ b/gunrock/app/mst/mst_functor.cuh
@@ -129,7 +129,7 @@ struct EdgeFunctor
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      problem->d_origin_edges[e_id], problem->d_temp_storage + s_id);
+      problem->d_origin_edges[e_id], problem->d_temp_index + s_id);
   }
 };
 
@@ -185,7 +185,7 @@ struct MarkFunctor
   {
     // mark minimum spanning tree output edges
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      1, problem->d_mst_output + problem->d_temp_storage[s_id]);
+      1, problem->d_mst_output + problem->d_temp_index[s_id]);
   }
 };
 
@@ -247,7 +247,7 @@ struct CyRmFunctor
 
     // remove some edges in the MST output result
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      0, problem->d_mst_output + problem->d_temp_storage[s_id]);
+      0, problem->d_mst_output + problem->d_temp_index[s_id]);
   }
 };
 
@@ -364,13 +364,14 @@ struct EgRmFunctor
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_keys_array + e_id);
+      (VertexId)-1, problem->d_keys_array + e_id);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_col_indices + e_id);
+      (VertexId)-1, problem->d_col_indices + e_id);
+    //util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
+    //  (Value)-1, problem->d_edge_weights + e_id);
+    problem->d_edge_weights[e_id] = (Value) -1;
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_edge_weights + e_id);
-    util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_origin_edges + e_id);
+      (VertexId)-1, problem->d_origin_edges + e_id);
   }
 
   /**
@@ -506,7 +507,7 @@ struct EIdxFunctor
     VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      node, problem->d_row_offsets + problem->d_temp_storage[node]);
+      node, problem->d_row_offsets + problem->d_temp_index[node]);
   }
 };
 
@@ -607,13 +608,13 @@ struct SuRmFunctor
     VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_keys_array + node);
+      (VertexId)-1, problem->d_keys_array + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_col_indices + node);
+      (VertexId)-1, problem->d_col_indices + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_edge_weights + node);
+      (Value)   -1, problem->d_edge_weights + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_origin_edges + node);
+      (VertexId)-1, problem->d_origin_edges + node);
   }
 };
 
diff --git a/gunrock/app/mst/mst_problem.cuh b/gunrock/app/mst/mst_problem.cuh
index b2b8e7f1f..9f4e3db6b 100644
--- a/gunrock/app/mst/mst_problem.cuh
+++ b/gunrock/app/mst/mst_problem.cuh
@@ -47,7 +47,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
   typedef _SizeT    SizeT;
   typedef _Value    Value;
 
-  static const bool MARK_PREDECESSORS  = true;
+  static const bool MARK_PREDECESSORS  =  true;
   static const bool ENABLE_IDEMPOTENCE = false;
 
   // helper structures
@@ -71,10 +71,10 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
     VertexId     *d_origin_edges; // origin edge list keep track of e_ids
     VertexId     *d_super_edges;  // super edge list for next iteration
     VertexId     *d_col_indices;  // column indices of CSR graph (edges)
+    VertexId     *d_temp_index;   // used for storing temp index
+    Value        *d_temp_value;   // used for storing temp value
     Value        *d_reduced_vals; // store reduced minimum weights
     Value        *d_edge_weights; // store weights per edge
-    Value        *d_temp_storage; // used for storing temp arrays
-    Value        *d_tmp_storage;  // used for storing temp arrays
     SizeT        *d_supervtx_ids; // super vertex ids scanned from flags
     SizeT        *d_row_offsets;  // row offsets of CSR graph
   };
@@ -107,10 +107,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
    * @brief MSTProblem default constructor
    */
 
-  MSTProblem():
-  nodes(0),
-  edges(0),
-  num_gpus(0) {}
+  MSTProblem(): nodes(0), edges(0), num_gpus(0) {}
 
   /**
    * @brief MSTProblem constructor
@@ -153,9 +150,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
       if (data_slices[i]->d_keys_array)
         util::GRError(cudaFree(data_slices[i]->d_keys_array),
           "GpuSlice cudaFree  d_keys_array  failed", __FILE__, __LINE__);
-      if (data_slices[i]->d_temp_storage)
-        util::GRError(cudaFree(data_slices[i]->d_temp_storage),
-          "GpuSlice cudaFree d_temp_storage failed", __FILE__, __LINE__);
+      if (data_slices[i]->d_temp_index)
+        util::GRError(cudaFree(data_slices[i]->d_temp_index),
+          "GpuSlice cudaFree  d_temp_index  failed", __FILE__, __LINE__);
       if (data_slices[i]->d_reduced_keys)
         util::GRError(cudaFree(data_slices[i]->d_reduced_keys),
           "GpuSlice cudaFree d_reduced_keys failed", __FILE__, __LINE__);
@@ -183,9 +180,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
       if (data_slices[i]->d_edge_flags)
         util::GRError(cudaFree(data_slices[i]->d_edge_flags),
           "GpuSlice cudaFree  d_edge_flags  failed", __FILE__, __LINE__);
-      if (data_slices[i]->d_tmp_storage)
-        util::GRError(cudaFree(data_slices[i]->d_tmp_storage),
-          "GpuSlice cudaFree d_tmp_storage  failed", __FILE__, __LINE__);
+      if (data_slices[i]->d_temp_value)
+        util::GRError(cudaFree(data_slices[i]->d_temp_value),
+          "GpuSlice cudaFree  d_temp_value  failed", __FILE__, __LINE__);
       if (data_slices[i]->d_super_edges)
         util::GRError(cudaFree(data_slices[i]->d_super_edges),
           "GpuSlice cudaFree d_super_edges  failed", __FILE__, __LINE__);
@@ -348,7 +345,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
           __FILE__, __LINE__)) return retval;
           data_slices[0]->d_reduced_vals = d_reduced_vals;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_reduced_vals, 0, nodes);
+          data_slices[0]->d_reduced_vals, (Value)0, nodes);
 
         unsigned int *d_flags_array;
         if (retval = util::GRError(cudaMalloc(
@@ -370,15 +367,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         util::MemsetKernel<<<128, 128>>>(
           data_slices[0]->d_keys_array, 0, edges);
 
-        SizeT *d_temp_storage;
+        VertexId *d_temp_index;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_temp_storage,
-          edges * sizeof(SizeT)),
-          "MSTProblem cudaMalloc d_temp_storage Failed",
+          (void**)&d_temp_index,
+          edges * sizeof(VertexId)),
+          "MSTProblem cudaMalloc d_temp_index Failed",
           __FILE__, __LINE__)) return retval;
-          data_slices[0]->d_temp_storage = d_temp_storage;
+          data_slices[0]->d_temp_index = d_temp_index;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_temp_storage, 0, edges);
+          data_slices[0]->d_temp_index, (VertexId)0, edges);
 
         VertexId *d_reduced_keys;
         if (retval = util::GRError(cudaMalloc(
@@ -473,15 +470,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         util::MemsetKernel<unsigned int><<<128, 128>>>(
           data_slices[0]->d_edge_flags, 0, edges);
 
-        Value *d_tmp_storage;
+        Value *d_temp_value;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_tmp_storage,
+          (void**)&d_temp_value,
           edges * sizeof(Value)),
-          "MSTProblem cudaMalloc d_tmp_storage Failed",
+          "MSTProblem cudaMalloc d_temp_value Failed",
           __FILE__, __LINE__)) return retval;
-          data_slices[0]->d_tmp_storage = d_tmp_storage;
+          data_slices[0]->d_temp_value = d_temp_value;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_tmp_storage, 0, edges);
+          data_slices[0]->d_temp_value, (Value)0, edges);
 
         data_slices[0]->d_labels = NULL;
       }
@@ -576,14 +573,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         data_slices[gpu]->d_keys_array = d_keys_array;
       }
 
-      if (!data_slices[gpu]->d_temp_storage)
+      if (!data_slices[gpu]->d_temp_index)
       {
-        SizeT *d_temp_storage;
+        VertexId *d_temp_index;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_temp_storage, edges * sizeof(SizeT)),
-          "MSTProblem cudaMalloc d_temp_storage Failed",
+          (void**)&d_temp_index, edges * sizeof(VertexId)),
+          "MSTProblem cudaMalloc d_temp_index Failed",
           __FILE__, __LINE__)) return retval;
-        data_slices[gpu]->d_temp_storage = d_temp_storage;
+        data_slices[gpu]->d_temp_index = d_temp_index;
       }
 
       if (!data_slices[gpu]->d_successors)
@@ -685,14 +682,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         data_slices[gpu]->d_edge_flags = d_edge_flags;
       }
 
-      if (!data_slices[gpu]->d_tmp_storage)
+      if (!data_slices[gpu]->d_temp_value)
       {
-        Value *d_tmp_storage;
+        Value *d_temp_value;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_tmp_storage, edges * sizeof(Value)),
-          "MSTProblem cudaMalloc d_tmp_storage Failed",
+          (void**)&d_temp_value, edges * sizeof(Value)),
+          "MSTProblem cudaMalloc d_temp_value Failed",
           __FILE__, __LINE__)) return retval;
-        data_slices[gpu]->d_tmp_storage = d_tmp_storage;
+        data_slices[gpu]->d_temp_value = d_temp_value;
       }
 
       data_slices[0]->d_labels = NULL;
@@ -727,4 +724,4 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
 // Local Variables:
 // mode:c++
 // c-file-style: "NVIDIA"
-// End:
\ No newline at end of file
+// End:
diff --git a/gunrock/util/select_utils.cuh b/gunrock/util/select_utils.cuh
index 2db66ca14..1da71e8fa 100644
--- a/gunrock/util/select_utils.cuh
+++ b/gunrock/util/select_utils.cuh
@@ -18,136 +18,104 @@
 namespace gunrock {
 namespace util {
 
-    /**
-     * \addtogroup PublicInterface
-     * @{
-     */
-
-    //---------------------------------------------------------------------
-    // Globals, constants and typedefs
-    //---------------------------------------------------------------------
-    struct GreaterThan
-    {
-	int compare;
-
-	__host__ __device__ __forceinline__
-	GreaterThan(int compare) : compare(compare) { }
-
-	__host__ __device__ __forceinline__
-	bool operator()(const int &a) const { return (a > compare); }
-    };
-
-    /**
-     * @brief selects items from from a sequence of int keys using a
-     * section functor (greater-than)
-     *
-     */
-    template <typename VertexId, typename SizeT>
-    cudaError_t CUBSelect(
-	VertexId  *d_input,
-	SizeT     num_elements,
-	VertexId  *d_output,
-	unsigned int *num_selected)
-    {
-	cudaError_t retval = cudaSuccess;
-
-	/*
-	  VertexId *input  = NULL;
-	  VertexId *output = NULL;
-
-	  if (util::GRError((retval = cudaMalloc(
-	  &input, sizeof(VertexId)*d_num_elements)),
-	  "CUBSelect input malloc failed",
-	  __FILE__, __LINE__)) return retval;
-	  if (util::GRError((retval = cudaMalloc(
-	  &output, sizeof(VertexId)*d_num_elements)),
-	  "CUBSelect output malloc failed",
-	  __FILE__, __LINE__)) return retval;
-
-	  cub::DoubleBuffer<VertexId> d_input_buffer(d_input, input);
-	  cub::DoubleBuffer<VertexId> d_output_buffer(d_output, output);
-	*/
-
-	unsigned int *d_num_selected = NULL;
-	if (util::GRError((retval = cudaMalloc(
-	    (void**)&d_num_selected, sizeof(unsigned int))),
-	    "CUBSelect d_num_selected malloc failed",
-	    __FILE__, __LINE__)) return retval;
-
-	void  *d_temp_storage = NULL;
-	size_t temp_storage_bytes = 0;
-	GreaterThan select_op(-1);
-
-	// determine temporary device storage requirements
-	if (util::GRError((retval = cub::DeviceSelect::If(
-	    d_temp_storage,
-	    temp_storage_bytes,
-	    d_input,
-	    d_output,
-	    d_num_selected,
-	    num_elements,
-	    select_op)),
-	    "CUBSelect cub::DeviceSelect::If failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// allocate temporary storage
-	if (util::GRError((retval = cudaMalloc(
-	    &d_temp_storage, temp_storage_bytes)),
-	    "CUBSelect malloc d_temp_storage failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// run selection
-	if (util::GRError((retval = cub::DeviceSelect::If(
-	    d_temp_storage,
-	    temp_storage_bytes,
-	    d_input,
-	    d_output,
-	    d_num_selected,
-	    num_elements,
-	    select_op)),
+/**
+ * \addtogroup PublicInterface
+ * @{
+ */
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+struct GreaterThan
+{
+    int compare;
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int compare) : compare(compare) { }
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &a) const { return (a > compare); }
+};
+
+/**
+ * @brief selects items from a sequence of int keys using a
+ * selection functor (greater-than)
+ *
+ */
+template <typename T, typename SizeT>
+cudaError_t CUBSelect(
+    T            *d_input,
+    SizeT         num_elements,
+    T     *d_output,
+    unsigned int *num_selected)
+{
+    cudaError_t retval = cudaSuccess;
+    unsigned int *d_num_selected = NULL;
+
+    if (util::GRError(
+            (retval = cudaMalloc((void**)&d_num_selected, sizeof(unsigned int))),
+            "CUBSelect d_num_selected malloc failed",
+            __FILE__, __LINE__)) return retval;
+
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+    GreaterThan select_op(-1);
+
+    // determine temporary device storage requirements
+    if (util::GRError(
+            (retval = cub::DeviceSelect::If(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_input,
+                d_output,
+                d_num_selected,
+                num_elements,
+                select_op)),
             "CUBSelect cub::DeviceSelect::If failed",
             __FILE__, __LINE__)) return retval;
 
-	/*
-	// copy back output
-	if (util::GRError((retval = cudaMemcpy(
-	d_output,
-	d_output_buffer.Current(),
-	sizeof(VertexId)*(*d_num_selected),
-	cudaMemcpyDeviceToDevice)),
-	"CUBSelect copy back output failed",
-	__FILE__, __LINE__)) return retval;
-	*/
-
-	if (util::GRError((retval = cudaMemcpy(
-	    num_selected,
-	    d_num_selected,
-	    sizeof(unsigned int),
-	    cudaMemcpyDeviceToHost)),
-	    "CUBSelect copy back num_selected failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// clean up
-	if (util::GRError((retval = cudaFree(d_temp_storage)),
-	    "CUBSelect free d_temp_storage failed",
-	    __FILE__, __LINE__)) return retval;
-	if (util::GRError((retval = cudaFree(d_num_selected)),
-            "CUBSelect free d_num_selected failed",
-	    __FILE__, __LINE__)) return retval;
+    // allocate temporary storage
+    if (util::GRError(
+            (retval = cudaMalloc(&d_temp_storage, temp_storage_bytes)),
+            "CUBSelect malloc d_temp_storage failed",
+            __FILE__, __LINE__)) return retval;
+
+    // run selection
+    if (util::GRError(
+            (retval = cub::DeviceSelect::If(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_input,
+                d_output,
+                d_num_selected,
+                num_elements,
+                select_op)),
+            "CUBSelect cub::DeviceSelect::If failed",
+            __FILE__, __LINE__)) return retval;
 
-	/*
-	  if (util::GRError((retval = cudaFree(input)),
-	  "CUBSelect free input failed",
-	  __FILE__, __LINE__)) return retval;
-	  if (util::GRError((retval = cudaFree(output)),
-	  "CUBSelect free output failed",
-	  __FILE__, __LINE__)) return retval;
-	*/
+    if (util::GRError(
+            (retval = cudaMemcpy(
+                num_selected,
+                d_num_selected,
+                sizeof(unsigned int),
+                cudaMemcpyDeviceToHost)),
+            "CUBSelect copy back num_selected failed",
+            __FILE__, __LINE__)) return retval;
+
+    // clean up
+    if (util::GRError(
+            (retval = cudaFree(d_temp_storage)),
+            "CUBSelect free d_temp_storage failed",
+            __FILE__, __LINE__)) return retval;
+    if (util::GRError(
+            (retval = cudaFree(d_num_selected)),
+            "CUBSelect free d_num_selected failed",
+            __FILE__, __LINE__)) return retval;
 
-	return retval;
-    }
+    return retval;
+}
 
-    /** @} */
+/** @} */
 
 } //util
 } //gunrock
diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index afefdab23..d74e89233 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -161,18 +161,18 @@ bool IsConnected(const Csr<VertexId, Value, SizeT> & graph)
  */
 ////////////////////////////////////////////////////////////////////////////////
 template<typename VertexId, typename Value, typename SizeT>
-long long int SimpleReferenceMST(
+Value SimpleReferenceMST(
   const Value *edge_values, const Csr<VertexId, Value, SizeT> &graph)
 {
-  printf("\nREFERENCE TEST\n");
+  printf("\nMST CPU REFERENCE TEST\n");
 
   // Kruskal minimum spanning tree preparations
   using namespace boost;
-  typedef adjacency_list < vecS, vecS, undirectedS,
-    no_property, property < edge_weight_t, int > >  Graph;
+  typedef adjacency_list< vecS, vecS, undirectedS,
+    no_property, property<edge_weight_t, float> >   Graph;
   typedef graph_traits < Graph >::edge_descriptor   Edge;
   typedef graph_traits < Graph >::vertex_descriptor Vertex;
-  typedef std::pair<int, int> E;
+  typedef std::pair<VertexId, VertexId> E;
 
   E *edge_pairs = new E[graph.edges];
   int idx = 0;
@@ -190,16 +190,18 @@ long long int SimpleReferenceMST(
 
   CpuTimer cpu_timer; // record the kernel running time
   cpu_timer.Start();
+
   // compute reference using kruskal_min_spanning_tree algorithm
   kruskal_minimum_spanning_tree(g, std::back_inserter(spanning_tree));
+
   cpu_timer.Stop();
   float elapsed_cpu = cpu_timer.ElapsedMillis();
 
   // analyze reference results
-  SizeT         num_selected_cpu = 0;
-  long long int total_weight_cpu = 0;
+  SizeT num_selected_cpu = 0;
+  Value total_weight_cpu = 0;
 
-  if (graph.nodes <= 50) printf("CPU Minimum Spanning Tree\n");
+  if (graph.nodes <= 50) { printf("CPU Minimum Spanning Tree\n"); }
   for (std::vector < Edge >::iterator ei = spanning_tree.begin();
        ei != spanning_tree.end(); ++ei)
   {
@@ -207,7 +209,7 @@ long long int SimpleReferenceMST(
     {
       // print the edge pairs in the minimum spanning tree
       printf("%ld %ld\n", source(*ei, g), target(*ei, g));
-      // printf("  with weight of %d\n", weight[*ei]);
+      // printf("  with weight of %f\n", weight[*ei]);
     }
     ++num_selected_cpu;
     total_weight_cpu += weight[*ei];
@@ -297,27 +299,27 @@ void RunTests(
     // printf("\nGPU - Number of Edges in MST: %d\n", num_selected_gpu);
 
     // calculate GPU total selected MST weights for validation
-    long long int total_weight_gpu = 0;
+    Value total_weight_gpu = 0;
     for (int iter = 0; iter < graph.edges; ++iter)
     {
       total_weight_gpu += h_mst_output[iter] * graph.edge_values[iter];
     }
 
     // correctness validation
-    long long int total_weight_cpu =
-      SimpleReferenceMST(graph.edge_values, graph);
+    Value total_weight_cpu = SimpleReferenceMST(graph.edge_values, graph);
     if (total_weight_cpu == total_weight_gpu)
     {
       // print the edge pairs in the minimum spanning tree
       DisplaySolution(graph, h_mst_output);
       printf("\nCORRECT.\n");
+      std::cout << "CPU Computed Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Computed Total Weight = " << total_weight_gpu << std::endl;
     }
     else
     {
-      printf("INCORRECT. \n"
-             "CPU Computed Total Weight = %lld\n"
-             "GPU Computed Total Weight = %lld\n",
-             total_weight_cpu, total_weight_gpu);
+      printf("INCORRECT.\n");
+      std::cout << "CPU Computed Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Computed Total Weight = " << total_weight_gpu << std::endl;
     }
   }
 
@@ -400,12 +402,12 @@ int main(int argc, char** argv)
 
   if (graph_type == "market")
   {
-
     // matrix-market coordinate-formatted graph file
 
-    typedef int VertexId;  // use as the vertex identifier type
+    // currently support Value type: int, float, double
+    typedef int VertexId;  // use as the vertex identifier
     typedef int Value;     // use as the value type
-    typedef int SizeT;     // use as the graph size type
+    typedef int SizeT;     // use as the graph size
 
     // default value for stream_from_host is false
     if (graph_args < 1)
@@ -420,13 +422,10 @@ int main(int argc, char** argv)
     // template argument = true because the graph has edge values
     Csr<VertexId, Value, SizeT> csr(false);
     if (graphio::BuildMarketGraph<true>(
-      market_filename,
-      csr,
-      g_undirected,
-      false) != 0) { return 1; }
+      market_filename, csr, g_undirected, false) != 0) { return 1; }
 
     // display input graph
-    // csr.DisplayGraph();
+    // csr.DisplayGraph(true);
 
     /**************************************************************************
      * Note: Minimum Spanning Tree only supports undirected, connected graphs *
@@ -455,4 +454,4 @@ int main(int argc, char** argv)
 // Local Variables:
 // mode:c++
 // c-file-style: "NVIDIA"
-// End:
+// End:

From 69b922725851d862933d3d3a4da2398a4d901de0 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 11:10:44 -0700
Subject: [PATCH 11/36] minor template type fix for parsing input graph

---
 gunrock/csr.cuh            | 192 ++++++++++++++++---------------------
 gunrock/graphio/market.cuh |  68 ++++++-------
 2 files changed, 108 insertions(+), 152 deletions(-)

diff --git a/gunrock/csr.cuh b/gunrock/csr.cuh
index 5e9cc8e4e..d4d5da2ff 100644
--- a/gunrock/csr.cuh
+++ b/gunrock/csr.cuh
@@ -34,8 +34,7 @@ namespace gunrock {
  * the graph as a sparse matrix.
  */
 template<typename VertexId, typename Value, typename SizeT>
-struct Csr
-{
+struct Csr {
     SizeT nodes;     /**< Number of nodes in the graph. */
     SizeT edges;     /**< Number of edges in the graph. */
     SizeT out_nodes; /**< Number of nodes which have outgoing edges. */
@@ -57,8 +56,7 @@ struct Csr
      * @param[in] pinned Use pinned memory for CSR data structure
      * (default: do not use pinned memory)
      */
-    Csr(bool pinned = false)
-    {
+    Csr(bool pinned = false) {
         nodes = 0;
         edges = 0;
         average_degree = 0;
@@ -79,8 +77,7 @@ struct Csr
      * @param[in] edges Number of edges in COO-format graph
      */
     template <bool LOAD_EDGE_VALUES, bool LOAD_NODE_VALUES>
-    void FromScratch(SizeT nodes, SizeT edges)
-    {
+    void FromScratch(SizeT nodes, SizeT edges) {
         this->nodes = nodes;
         this->edges = edges;
 
@@ -89,32 +86,32 @@ struct Csr
             // Put our graph in pinned memory
             int flags = cudaHostAllocMapped;
             if (gunrock::util::GRError(
-                    cudaHostAlloc((void **)&row_offsets,
-                                  sizeof(SizeT) * (nodes + 1), flags),
-                    "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__))
+                        cudaHostAlloc((void **)&row_offsets,
+                                      sizeof(SizeT) * (nodes + 1), flags),
+                        "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__))
                 exit(1);
             if (gunrock::util::GRError(
-                    cudaHostAlloc((void **)&column_indices,
-                                  sizeof(VertexId) * edges, flags),
-                    "Csr cudaHostAlloc column_indices failed",
-                    __FILE__, __LINE__))
+                        cudaHostAlloc((void **)&column_indices,
+                                      sizeof(VertexId) * edges, flags),
+                        "Csr cudaHostAlloc column_indices failed",
+                        __FILE__, __LINE__))
                 exit(1);
 
             if (LOAD_NODE_VALUES) {
                 if (gunrock::util::GRError(
-                        cudaHostAlloc((void **)&node_values,
-                                      sizeof(Value) * nodes, flags),
-                        "Csr cudaHostAlloc node_values failed",
-                        __FILE__, __LINE__))
+                            cudaHostAlloc((void **)&node_values,
+                                          sizeof(Value) * nodes, flags),
+                            "Csr cudaHostAlloc node_values failed",
+                            __FILE__, __LINE__))
                     exit(1);
             }
 
             if (LOAD_EDGE_VALUES) {
                 if (gunrock::util::GRError(
-                        cudaHostAlloc((void **)&edge_values,
-                                      sizeof(Value) * edges, flags),
-                        "Csr cudaHostAlloc edge_values failed",
-                        __FILE__, __LINE__))
+                            cudaHostAlloc((void **)&edge_values,
+                                          sizeof(Value) * edges, flags),
+                            "Csr cudaHostAlloc edge_values failed",
+                            __FILE__, __LINE__))
                     exit(1);
             }
 
@@ -124,9 +121,9 @@ struct Csr
             row_offsets = (SizeT*) malloc(sizeof(SizeT) * (nodes + 1));
             column_indices = (VertexId*) malloc(sizeof(VertexId) * edges);
             node_values = (LOAD_NODE_VALUES) ?
-                (Value*) malloc(sizeof(Value) * nodes) : NULL;
+                          (Value*) malloc(sizeof(Value) * nodes) : NULL;
             edge_values = (LOAD_EDGE_VALUES) ?
-                (Value*) malloc(sizeof(Value) * edges) : NULL;
+                          (Value*) malloc(sizeof(Value) * edges) : NULL;
         }
     }
 
@@ -143,20 +140,17 @@ struct Csr
         SizeT    num_edges,
         SizeT    *row_offsets,
         VertexId *col_indices,
-        Value    *edge_values = NULL)
-    {
+        Value    *edge_values = NULL) {
         std::ofstream output(file_name);
-        if (output.is_open())
-        {
+        if (output.is_open()) {
             output << num_nodes << " " << num_edges << " ";
             std::copy(row_offsets, row_offsets + num_nodes + 1,
-                      std::ostream_iterator<int>(output, " "));
+                      std::ostream_iterator<SizeT>(output, " "));  // row_offsets is SizeT*, not VertexId*
             std::copy(column_indices, column_indices + num_edges,
-                      std::ostream_iterator<int>(output, " "));
-            if (edge_values != NULL)
-            {
+                      std::ostream_iterator<VertexId>(output, " "));
+            if (edge_values != NULL) {
                 std::copy(edge_values, edge_values + num_edges,
-                          std::ostream_iterator<int>(output, " "));
+                          std::ostream_iterator<Value>(output, " "));
             }
             output.close();
         }
@@ -168,21 +162,20 @@ struct Csr
      *
      */
     template <bool LOAD_EDGE_VALUES>
-    void FromCsr(char *f_in, bool undirected, bool reversed)
-    {
+    void FromCsr(char *f_in, bool undirected, bool reversed) {
         printf("  Reading directly from previously stored CSR arrays ...\n");
 
         std::ifstream _file(f_in);
 
-        if (_file.is_open())
-        {
+        if (_file.is_open()) {
             time_t mark1 = time(NULL);
 
-            std::istream_iterator<int> start(_file), end;
-            std::vector<int> v(start, end);
+            std::istream_iterator<double> start(_file), end;  // double keeps counts/indices exact; a float Value rounds above 2^24
+            std::vector<double> v(start, end);
+
+            SizeT csr_nodes = v.at(0);  // bounds-checked: the file may be empty or truncated
+            SizeT csr_edges = v.at(1);
 
-            SizeT csr_nodes = v.at(0);
-            SizeT csr_edges = v.at(1);
 
             FromScratch<LOAD_EDGE_VALUES, false>(csr_nodes, csr_edges);
 
@@ -190,8 +183,7 @@ struct Csr
             std::copy(v.begin() + 3 + csr_nodes,
                       v.begin() + 3 + csr_nodes + csr_edges,
                       column_indices);
-            if(LOAD_EDGE_VALUES)
-            {
+            if (LOAD_EDGE_VALUES) {
                 std::copy(v.begin() + 3 + csr_nodes + csr_edges,
                           v.end(), edge_values);
             }
@@ -200,27 +192,20 @@ struct Csr
             printf("Done reading (%ds).\n", (int) (mark2 - mark1));
 
             v.clear();
-        }
-        else
-        {
+        } else {
             perror("Unable To Open The File.");
         }
 
         // compute out_nodes
         SizeT out_node = 0;
-        for (SizeT node = 0; node < nodes; node++)
-        {
-            if (row_offsets[node+1] - row_offsets[node] > 0)
-            {
+        for (SizeT node = 0; node < nodes; node++) {
+            if (row_offsets[node + 1] - row_offsets[node] > 0) {
                 ++out_node;
             }
         }
         out_nodes = out_node;
-
-        fflush(stdout);
     }
 
-
     /**
      * @brief Build CSR graph from COO graph, sorted or unsorted
      *
@@ -241,8 +226,7 @@ struct Csr
         SizeT coo_edges,
         bool  ordered_rows = false,
         bool  undirected = false,
-        bool  reversed = false)
-    {
+        bool  reversed = false) {
         printf("  Converting %d vertices, %d directed edges (%s tuples) "
                "to CSR format...\n",
                coo_nodes, coo_edges, ordered_rows ? "ordered" : "unordered");
@@ -259,20 +243,18 @@ struct Csr
         Tuple *new_coo = (Tuple*) malloc(sizeof(Tuple) * coo_edges);
         SizeT real_edge = 0;
         if (coo[0].col != coo[0].row) {
-          new_coo[0].row = coo[0].row;
-          new_coo[0].col = coo[0].col;
-          new_coo[0].val = coo[0].val;
-          real_edge++;
+            new_coo[0].row = coo[0].row;
+            new_coo[0].col = coo[0].col;
+            new_coo[0].val = coo[0].val;
+            real_edge++;
         }
-        for (int i = 0; i < coo_edges-1; ++i)
-        {
-            if (((coo[i+1].col != coo[i].col) ||
-                 (coo[i+1].row != coo[i].row)) &&
-                (coo[i+1].col != coo[i+1].row))
-            {
-                new_coo[real_edge].col = coo[i+1].col;
-                new_coo[real_edge].row = coo[i+1].row;
-                new_coo[real_edge++].val = coo[i+1].val;
+        for (int i = 0; i < coo_edges - 1; ++i) {
+            if (((coo[i + 1].col != coo[i].col) ||
+                    (coo[i + 1].row != coo[i].row)) &&
+                    (coo[i + 1].col != coo[i + 1].row)) {
+                new_coo[real_edge].col = coo[i + 1].col;
+                new_coo[real_edge].row = coo[i + 1].row;
+                new_coo[real_edge++].val = coo[i + 1].val;
             }
         }
 
@@ -303,13 +285,10 @@ struct Csr
         printf("Done converting (%ds).\n", (int)(mark2 - mark1));
 
         // Write offsets, indices, node, edges etc. into file
-        if (LOAD_EDGE_VALUES)
-        {
+        if (LOAD_EDGE_VALUES) {
             WriteToFile(output_file, undirected, reversed, nodes, edges,
                         row_offsets, column_indices, edge_values);
-        }
-        else
-        {
+        } else {
             WriteToFile(output_file, undirected, reversed, nodes, edges,
                         row_offsets, column_indices);
         }
@@ -320,8 +299,7 @@ struct Csr
         // Compute out_nodes
         SizeT out_node = 0;
         for (SizeT node = 0; node < nodes; node++) {
-            if (row_offsets[node+1] - row_offsets[node] > 0)
-            {
+            if (row_offsets[node + 1] - row_offsets[node] > 0) {
                 ++out_node;
             }
         }
@@ -336,8 +314,7 @@ struct Csr
     /**
      * @brief Print log-scale degree histogram of the graph.
      */
-    void PrintHistogram()
-    {
+    void PrintHistogram() {
         fflush(stdout);
 
         // Initialize
@@ -380,18 +357,17 @@ struct Csr
     /**
      * @brief Display CSR graph to console
      */
-    void DisplayGraph(bool with_edge_value = false)
-    {
-        SizeT displayed_node_num = (nodes > 40) ? 40:nodes;
+    void DisplayGraph(bool with_edge_value = false) {
+        SizeT displayed_node_num = (nodes > 40) ? 40 : nodes;
         printf("First %d nodes's neighbor list of the input graph:\n",
                displayed_node_num);
         for (SizeT node = 0; node < displayed_node_num; node++) {
             util::PrintValue(node);
             printf(":");
             for (SizeT edge = row_offsets[node];
-                 edge < row_offsets[node + 1];
-                 edge++) {
-                 printf("[");
+                    edge < row_offsets[node + 1];
+                    edge++) {
+                printf("[");
                 util::PrintValue(column_indices[edge]);
                 if (with_edge_value) {
                     printf(",");
@@ -403,23 +379,22 @@ struct Csr
         }
     }
 
-    bool CheckValue()
-    {
+    bool CheckValue() {
         for (SizeT node = 0; node < nodes; ++node) {
             for (SizeT edge = row_offsets[node];
-                 edge < row_offsets[node+1];
-                 ++edge) {
-                 int src_node = node;
-                 int dst_node = column_indices[edge];
-                 int edge_value = edge_values[edge];
-                 for (SizeT r_edge = row_offsets[dst_node];
-                 r_edge < row_offsets[dst_node+1];
-                 ++r_edge) {
+                    edge < row_offsets[node + 1];
+                    ++edge) {
+                int src_node = node;
+                int dst_node = column_indices[edge];
+                Value edge_value = edge_values[edge];  // keep Value: int would truncate float/double weights
+                for (SizeT r_edge = row_offsets[dst_node];
+                        r_edge < row_offsets[dst_node + 1];
+                        ++r_edge) {
                     if (column_indices[r_edge] == src_node) {
                         if (edge_values[r_edge] != edge_value)
                             return false;
                     }
-                 }
+                }
             }
         }
         return true;
@@ -428,14 +403,12 @@ struct Csr
     /**
      * @brief Find node with largest neighbor list
      */
-    int GetNodeWithHighestDegree(int& max_degree)
-    {
+    int GetNodeWithHighestDegree(int& max_degree) {
         int degree = 0;
         int src = 0;
         for (SizeT node = 0; node < nodes; node++) {
-            if (row_offsets[node+1] - row_offsets[node] > degree)
-            {
-                degree = row_offsets[node+1]-row_offsets[node];
+            if (row_offsets[node + 1] - row_offsets[node] > degree) {
+                degree = row_offsets[node + 1] - row_offsets[node];
                 src = node;
             }
         }
@@ -446,16 +419,15 @@ struct Csr
     /**
      * @brief Display the neighbor list of a given node
      */
-    void DisplayNeighborList(VertexId node)
-    {
+    void DisplayNeighborList(VertexId node) {
         if (node < 0 || node >= nodes) return;
         for (SizeT edge = row_offsets[node];
-                 edge < row_offsets[node + 1];
-                 edge++) {
-                util::PrintValue(column_indices[edge]);
-                printf(", ");
-            }
-            printf("\n");
+                edge < row_offsets[node + 1];
+                edge++) {
+            util::PrintValue(column_indices[edge]);
+            printf(", ");
+        }
+        printf("\n");
     }
 
     /**
@@ -466,7 +438,7 @@ struct Csr
             double mean = 0, count = 0;
             for (SizeT node = 0; node < nodes; ++node) {
                 count += 1;
-                mean += (row_offsets[node+1]- row_offsets[node] - mean) / count;
+                mean += (row_offsets[node + 1] - row_offsets[node] - mean) / count;
             }
             average_degree = static_cast<SizeT>(mean);
         }
@@ -512,8 +484,7 @@ struct Csr
     /**
      * @brief Deallocates CSR graph
      */
-    void Free()
-    {
+    void Free() {
         if (row_offsets) {
             if (pinned) {
                 gunrock::util::GRError(cudaFreeHost(row_offsets),
@@ -544,8 +515,7 @@ struct Csr
     /**
      * @brief CSR destructor
      */
-    ~Csr()
-    {
+    ~Csr() {
         Free();
     }
 };
diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh
index 494200f4b..74284f065 100644
--- a/gunrock/graphio/market.cuh
+++ b/gunrock/graphio/market.cuh
@@ -56,8 +56,7 @@ int ReadMarketStream(
     char *output_file,
     Csr<VertexId, Value, SizeT> &csr_graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     typedef Coo<VertexId, Value> EdgeTupleType;
 
     SizeT edges_read = -1;
@@ -73,7 +72,7 @@ int ReadMarketStream(
 
     bool ordered_rows = true;
 
-    while(true) {
+    while (true) {
 
         if (fscanf(f_in, "%[^\n]\n", line) <= 0) {
             break;
@@ -110,7 +109,7 @@ int ReadMarketStream(
             fflush(stdout);
 
             // Allocate coo graph
-            coo = (EdgeTupleType*) malloc(sizeof(EdgeTupleType) * edges);
+            coo = (EdgeTupleType*)malloc(sizeof(EdgeTupleType) * edges);
 
             edges_read++;
 
@@ -122,20 +121,21 @@ int ReadMarketStream(
                 return -1;
             }
             if (edges_read >= edges) {
-              fprintf(stderr,
-                      "Error parsing MARKET graph:"
-                      "encountered more than %d edges\n",
-                      edges);
-              if (coo) free(coo);
-              return -1;
+                fprintf(stderr,
+                        "Error parsing MARKET graph:"
+                        "encountered more than %d edges\n",
+                        edges);
+                if (coo) free(coo);
+                return -1;
             }
 
-            long long ll_row, ll_col, ll_value;
+            long long ll_row, ll_col;
+            double ll_value;  // read via "%lf" below; passing a Value* to a fixed "%d" is UB for float/double
             int num_input;
             if (LOAD_VALUES) {
                 if ((num_input = sscanf(
-                         line, "%lld %lld %lld",
-                         &ll_col, &ll_row, &ll_value)) < 2) {
+                                     line, "%lld %lld %lf",
+                                     &ll_col, &ll_row, &ll_value)) < 2) {
                     fprintf(stderr,
                             "Error parsing MARKET graph: badly formed edge\n");
                     if (coo) free(coo);
@@ -205,7 +205,6 @@ int ReadMarketStream(
                                             undirected, reversed);
 
     free(coo);
-
     fflush(stdout);
 
     return 0;
@@ -220,8 +219,7 @@ int ReadCsrArrays(
     char *f_in,
     Csr<VertexId, Value, SizeT> &csr_graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     csr_graph.template FromCsr<LOAD_VALUES>(f_in, undirected, reversed);
     return 0;
 }
@@ -249,34 +247,30 @@ int BuildMarketGraph(
     char *output_file,
     Csr<VertexId, Value, SizeT> &csr_graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     FILE *_file = fopen(output_file, "r");
-    if (_file)
-    {
+    if (_file) {
         fclose(_file);
         if (ReadCsrArrays<LOAD_VALUES>(
-                output_file, csr_graph, undirected, reversed) != 0) {
+                    output_file, csr_graph, undirected, reversed) != 0) {
             return -1;
         }
-    }
-    else {
+    } else {
         if (mm_filename == NULL) {
             // Read from stdin
             printf("Reading from stdin:\n");
             if (ReadMarketStream<LOAD_VALUES>(
-                    stdin, output_file, csr_graph, undirected, reversed) != 0) {
+                        stdin, output_file, csr_graph, undirected, reversed) != 0) {
                 return -1;
             }
-        }
-        else {
+        } else {
             // Read from file
             FILE *f_in = fopen(mm_filename, "r");
             if (f_in) {
                 printf("Reading from %s:\n", mm_filename);
                 if (ReadMarketStream<LOAD_VALUES>(
-                        f_in, output_file, csr_graph,
-                        undirected, reversed) != 0) {
+                            f_in, output_file, csr_graph,
+                            undirected, reversed) != 0) {
                     fclose(f_in);
                     return -1;
                 }
@@ -299,37 +293,29 @@ int BuildMarketGraph(
     char *file_in,
     Csr<VertexId, Value, SizeT> &graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     // seperate the graph path and the file name
     char *temp1 = strdup(file_in);
     char *temp2 = strdup(file_in);
     char *file_path = dirname (temp1);
     char *file_name = basename(temp2);
 
-    if (undirected)
-    {
+    if (undirected) {
         char ud[256];
         sprintf(ud, "%s/.%s_undirected_csr", file_path, file_name);
         if (BuildMarketGraph<true>(file_in, ud, graph, true, false) != 0)
             return 1;
-    }
-    else if (!undirected && reversed)
-    {
+    } else if (!undirected && reversed) {
         char rv[256];
         sprintf(rv, "%s/.%s_reversed_csr", file_path, file_name);
         if (BuildMarketGraph<true>(file_in, rv, graph, false, true) != 0)
             return 1;
-    }
-    else if (!undirected && !reversed)
-    {
+    } else if (!undirected && !reversed) {
         char nr[256];
         sprintf(nr, "%s/.%s_nonreversed_csr", file_path, file_name);
         if (BuildMarketGraph<true>(file_in, nr, graph, false, false) != 0)
             return 1;
-    }
-    else
-    {
+    } else {
         fprintf(stderr, "Unspecified Graph Type.\n");
     }
     return 0;

From a42e70bf2204314e1397303c292ef2aaffd80f60 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 11:14:40 -0700
Subject: [PATCH 12/36] small test mst sample dataset

---
 dataset/small/test_mst.mtx | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 dataset/small/test_mst.mtx

diff --git a/dataset/small/test_mst.mtx b/dataset/small/test_mst.mtx
new file mode 100644
index 000000000..a6949843f
--- /dev/null
+++ b/dataset/small/test_mst.mtx
@@ -0,0 +1,18 @@
+9 9 17
+1 2 2
+2 3 2
+2 4 17
+3 1 2
+3 4 38
+3 5 10
+4 5 2
+5 1 82
+5 2 11
+6 3 100
+6 4 100
+6 5 210
+6 7 2
+6 8 21
+7 3 120
+7 5 110
+8 9 2
\ No newline at end of file

From c5ff9203f12dfadb27d776f73bf61a3125bce878 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 12:02:29 -0700
Subject: [PATCH 13/36] changed some types to template rather than fixed uint

---
 gunrock/app/sssp/sssp_enactor.cuh |   7 +-
 gunrock/app/sssp/sssp_functor.cuh |  12 ++--
 gunrock/app/sssp/sssp_problem.cuh |   3 +-
 tests/sssp/test_sssp.cu           | 109 ++++++++++++++----------------
 4 files changed, 65 insertions(+), 66 deletions(-)

diff --git a/gunrock/app/sssp/sssp_enactor.cuh b/gunrock/app/sssp/sssp_enactor.cuh
index 7d9c5da43..855cbf9f6 100644
--- a/gunrock/app/sssp/sssp_enactor.cuh
+++ b/gunrock/app/sssp/sssp_enactor.cuh
@@ -207,16 +207,19 @@ class SSSPEnactor : public EnactorBase
     double                              queue_sizing,
     int                                 max_grid_size = 0)
     {
-        typedef typename SSSPProblem::SizeT      SizeT;
-        typedef typename SSSPProblem::VertexId   VertexId;
+        typedef typename SSSPProblem::VertexId VertexId;
+        typedef typename SSSPProblem::Value    Value;
+        typedef typename SSSPProblem::SizeT    SizeT;
 
         typedef SSSPFunctor<
             VertexId,
+            Value,
             SizeT,
             SSSPProblem> SsspFunctor;
 
         typedef PQFunctor<
             VertexId,
+            Value,
             SizeT,
             SSSPProblem> PqFunctor;
 
diff --git a/gunrock/app/sssp/sssp_functor.cuh b/gunrock/app/sssp/sssp_functor.cuh
index fa66e1dbd..37652a73f 100644
--- a/gunrock/app/sssp/sssp_functor.cuh
+++ b/gunrock/app/sssp/sssp_functor.cuh
@@ -32,7 +32,7 @@ namespace sssp {
  * @tparam ProblemData         Problem data type which contains data slice for SSSP problem
  *
  */
-template<typename VertexId, typename SizeT, typename ProblemData>
+template<typename VertexId, typename Value, typename SizeT, typename ProblemData>
 struct SSSPFunctor
 {
     typedef typename ProblemData::DataSlice DataSlice;
@@ -51,7 +51,7 @@ struct SSSPFunctor
      */
     static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
     {
-        unsigned int label, weight;
+        Value label, weight;
 
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
                         label, problem->d_labels + s_id);
@@ -111,7 +111,7 @@ struct SSSPFunctor
     }
 };
 
-template<typename VertexId, typename SizeT, typename ProblemData>
+template<typename VertexId, typename Value, typename SizeT, typename ProblemData>
 struct PQFunctor
 {
     typedef typename ProblemData::DataSlice DataSlice;
@@ -126,15 +126,15 @@ struct PQFunctor
      *
      * \return Whether to load the apply function for the edge and include the destination node in the next frontier.
      */
-    static __device__ __forceinline__ unsigned int ComputePriorityScore(VertexId node_id, DataSlice *problem)
+    static __device__ __forceinline__ Value ComputePriorityScore(VertexId node_id, DataSlice *problem)
     {
-        unsigned int weight;
+        Value weight;
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
                         weight, problem->d_labels + node_id);
         float delta;
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
                         delta, problem->d_delta);
-        return (delta == 0) ? weight : weight/delta;
+        return (delta == 0) ? weight : weight / delta;
     }
 };
  
diff --git a/gunrock/app/sssp/sssp_problem.cuh b/gunrock/app/sssp/sssp_problem.cuh
index c3957caf5..01fb49738 100644
--- a/gunrock/app/sssp/sssp_problem.cuh
+++ b/gunrock/app/sssp/sssp_problem.cuh
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <limits>
 #include <gunrock/app/problem_base.cuh>
 #include <gunrock/util/memset_kernel.cuh>
 
@@ -322,7 +323,7 @@ struct SSSPProblem : ProblemBase<_VertexId, _SizeT, false>
                 data_slices[gpu]->d_labels = d_labels;
             }
 
-            util::MemsetKernel<<<128, 128>>>(data_slices[gpu]->d_labels, UINT_MAX, nodes);
+            util::MemsetKernel<<<128, 128>>>(data_slices[gpu]->d_labels, std::numeric_limits<Value>::max(), nodes);
 
             if (!data_slices[gpu]->d_preds && MARK_PATHS) {
                 VertexId    *d_preds;
diff --git a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu
index 71c319f68..b084842b6 100644
--- a/tests/sssp/test_sssp.cu
+++ b/tests/sssp/test_sssp.cu
@@ -155,7 +155,7 @@ template<
 void DisplayStats(
     Stats               &stats,
     VertexId            src,
-    unsigned int        *h_labels,
+    Value               *h_labels,
     const Csr<VertexId, Value, SizeT> &graph,
     double              elapsed,
     VertexId            search_depth,
@@ -238,25 +238,24 @@ template<
     typename SizeT,
     bool     MARK_PREDECESSORS>
 void SimpleReferenceSssp(
-    const Csr<VertexId, Value, SizeT>       &graph,
-    unsigned int                            *node_values,
-    unsigned int                            *node_preds,
-    VertexId                                src)
+    const Csr<VertexId, Value, SizeT> &graph,
+    Value                             *node_values,
+    VertexId                          *node_preds,
+    VertexId                          src)
 {
     using namespace boost;
 
     // Prepare Boost Datatype and Data structure
     typedef adjacency_list<vecS, vecS, directedS, no_property,
-                           property <edge_weight_t, unsigned int> > Graph;
+                           property <edge_weight_t, float> > Graph;
 
     typedef graph_traits<Graph>::vertex_descriptor vertex_descriptor;
     typedef graph_traits<Graph>::edge_descriptor edge_descriptor;
 
-    typedef std::pair<unsigned int, unsigned int> Edge;
+    typedef std::pair<VertexId, VertexId> Edge;
 
-    Edge* edges = (Edge*)malloc(sizeof(Edge)*graph.edges);
-    unsigned int *weight =
-        (unsigned int*)malloc(sizeof(unsigned int)*graph.edges);
+    Edge   *edges = ( Edge*)malloc(sizeof( Edge)*graph.edges);
+    Value *weight = (Value*)malloc(sizeof(Value)*graph.edges);
 
     for (int i = 0; i < graph.nodes; ++i)
     {
@@ -269,7 +268,7 @@ void SimpleReferenceSssp(
 
     Graph g(edges, edges + graph.edges, weight, graph.nodes);
 
-    std::vector<unsigned int> d(graph.nodes);
+    std::vector<Value> d(graph.nodes);
     std::vector<vertex_descriptor> p(graph.nodes);
     vertex_descriptor s = vertex(src, g);
 
@@ -282,28 +281,30 @@ void SimpleReferenceSssp(
     CpuTimer cpu_timer;
     cpu_timer.Start();
 
-    if (MARK_PREDECESSORS)
-        dijkstra_shortest_paths(
-            g, s,
-            predecessor_map(boost::make_iterator_property_map(p.begin(), get(boost::vertex_index, g))).
-            distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g))));
-    else
-        dijkstra_shortest_paths(
-            g, s,
-            distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g))));
+    if (MARK_PREDECESSORS) {
+        dijkstra_shortest_paths(g, s,
+            predecessor_map(boost::make_iterator_property_map(
+                    p.begin(), get(boost::vertex_index, g))).distance_map(
+                        boost::make_iterator_property_map(
+                            d.begin(), get(boost::vertex_index, g))));
+    } else {
+        dijkstra_shortest_paths(g, s,
+            distance_map(boost::make_iterator_property_map(
+                    d.begin(), get(boost::vertex_index, g))));
+    }
     cpu_timer.Stop();
     float elapsed = cpu_timer.ElapsedMillis();
 
     printf("CPU SSSP finished in %lf msec.\n", elapsed);
 
-    Coo<unsigned int, unsigned int>* sort_dist = NULL;
-    Coo<unsigned int, unsigned int>* sort_pred = NULL;
-    sort_dist = (Coo<unsigned int, unsigned int>*)malloc(
-        sizeof(Coo<unsigned int, unsigned int>) * graph.nodes);
-    if (MARK_PREDECESSORS)
-        sort_pred = (Coo<unsigned int, unsigned int>*)malloc(
-            sizeof(Coo<unsigned int, unsigned int>) * graph.nodes);
-
+    Coo<Value, Value>* sort_dist = NULL;
+    Coo<VertexId, VertexId>* sort_pred = NULL;
+    sort_dist = (Coo<Value, Value>*)malloc(
+        sizeof(Coo<Value, Value>) * graph.nodes);
+    if (MARK_PREDECESSORS) {
+        sort_pred = (Coo<VertexId, VertexId>*)malloc(
+            sizeof(Coo<VertexId, VertexId>) * graph.nodes);
+    }
     graph_traits < Graph >::vertex_iterator vi, vend;
     for (tie(vi, vend) = vertices(g); vi != vend; ++vi)
     {
@@ -312,7 +313,7 @@ void SimpleReferenceSssp(
     }
     std::stable_sort(
         sort_dist, sort_dist + graph.nodes,
-        RowFirstTupleCompare<Coo<unsigned int, unsigned int> >);
+        RowFirstTupleCompare<Coo<Value, Value> >);
 
     if (MARK_PREDECESSORS)
     {
@@ -323,21 +324,21 @@ void SimpleReferenceSssp(
         }
         std::stable_sort(
             sort_pred, sort_pred + graph.nodes,
-            RowFirstTupleCompare<Coo<unsigned int, unsigned int> >);
+            RowFirstTupleCompare< Coo<VertexId, VertexId> >);
     }
 
     for (int i = 0; i < graph.nodes; ++i)
     {
         node_values[i] = sort_dist[i].col;
     }
-    if (MARK_PREDECESSORS)
+    if (MARK_PREDECESSORS) {
         for (int i = 0; i < graph.nodes; ++i)
         {
             node_preds[i] = sort_pred[i].col;
         }
-
-    free(sort_dist);
-    if (MARK_PREDECESSORS) free(sort_pred);
+    }
+    if (sort_dist) free(sort_dist);
+    if (sort_pred) free(sort_pred);
 }
 
 /**
@@ -382,18 +383,17 @@ void RunTests(
         Value,
         MARK_PREDECESSORS> Problem;
 
-    // Allocate host-side label array (for both reference and gpu-computed results)
-    unsigned int    *reference_labels       = (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes);
-    unsigned int    *h_labels               = (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes);
-    unsigned int    *reference_check_label  = (g_quick) ? NULL : reference_labels;
-    unsigned int    *reference_preds        = NULL;
-    VertexId        *h_preds                = NULL;
-    unsigned int    *reference_check_pred   = NULL;
+    // Allocate host-side arrays (for both reference and gpu-computed results)
+    Value    *reference_labels = (Value*)malloc(sizeof(Value) * graph.nodes);
+    Value    *h_labels         = (Value*)malloc(sizeof(Value) * graph.nodes);
+    Value    *reference_check_label = (g_quick) ? NULL : reference_labels;
+    VertexId *reference_preds       = NULL;
+    VertexId *h_preds               = NULL;
+    VertexId *reference_check_pred  = NULL;
 
     if (MARK_PREDECESSORS)
     {
-        reference_preds =
-            (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes);
+        reference_preds = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
         h_preds         = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
         reference_check_pred  = (g_quick) ? NULL : reference_preds;
     }
@@ -453,7 +453,6 @@ void RunTests(
     }
     elapsed /= iterations;
 
-
     sssp_enactor.GetStatistics(total_queued, search_depth, avg_duty);
 
     // Copy out results
@@ -499,13 +498,13 @@ void RunTests(
         avg_duty);
 
 
-    // Cleanup
-    delete stats;
-    if (csr_problem) delete csr_problem;
+    // Clean up
+    if (stats)            delete stats;
+    if (csr_problem)      delete csr_problem;
     if (reference_labels) free(reference_labels);
-    if (h_labels) free(h_labels);
-    if (reference_preds) free(reference_preds);
-    if (h_preds) free(h_preds);
+    if (h_labels)         free(h_labels);
+    if (reference_preds)  free(reference_preds);
+    if (h_preds)          free(h_preds);
 
     cudaDeviceSynchronize();
 }
@@ -540,7 +539,7 @@ void RunTests(
     int         iterations       = 1;   // Number of runs for testing
     int         delta_factor     = 16;  // Delta factor for priority queue
     int         traversal_mode   = -1;  // traversal mode: 0 for LB, 1 for TWC
-    g_quick                      = false;   // Whether or not to skip ref validation
+    g_quick                      = 0;   // Whether or not to skip ref validation
 
     // source vertex to start
     args.GetCmdLineArgument("src", src_str);
@@ -647,15 +646,10 @@ int main( int argc, char** argv)
         return 1;
     }
 
-    //DeviceInit(args);
-    //cudaSetDeviceFlags(cudaDeviceMapHost);
     int dev = 0;
     args.GetCmdLineArgument("device", dev);
     ContextPtr context = mgpu::CreateCudaDevice(dev);
 
-    //srand(0); // Presently deterministic
-    //srand(time(NULL));
-
     // Parse graph-contruction params
     g_undirected = args.CheckCmdLineFlag("undirected");
     std::string graph_type = argv[1];
@@ -678,6 +672,7 @@ int main( int argc, char** argv)
         typedef int VertexId;                   // Use as the node identifier
         typedef unsigned int Value;             // Use as the value type
         typedef int SizeT;                      // Use as the graph size type
+
         Csr<VertexId, Value, SizeT> csr(false); // default for stream_from_host
 
         if (graph_args < 1) { Usage(); return 1; }
@@ -692,7 +687,7 @@ int main( int argc, char** argv)
         }
 
         csr.PrintHistogram();
-        //csr.DisplayGraph(true); //print graph with edge_value
+        csr.DisplayGraph(true); //print graph with edge_value
         //csr.GetAverageEdgeValue();
         //csr.GetAverageDegree();
         //int max_degree;

From fa477a3192e41574893bc46958a083bf929a320c Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 13:40:32 -0700
Subject: [PATCH 14/36] clean up

---
 gunrock/app/mst/mst_enactor.cuh | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh
index 4908b71b9..0c44b617c 100644
--- a/gunrock/app/mst/mst_enactor.cuh
+++ b/gunrock/app/mst/mst_enactor.cuh
@@ -215,10 +215,8 @@ public:
     typedef SuRmFunctor <VertexId, SizeT, VertexId, MSTProblem> SuRmFunctor;
     typedef EIdxFunctor <VertexId, SizeT, VertexId, MSTProblem> EIdxFunctor;
     typedef MarkFunctor <VertexId, SizeT, VertexId, MSTProblem> MarkFunctor;
-    //typedef OrFunctor   <VertexId, SizeT, VertexId, MSTProblem> OrFunctor;
 
     cudaError_t retval = cudaSuccess;
-
     unsigned int *d_scanned_edges = NULL;
 
     do
@@ -273,7 +271,7 @@ public:
         }
 
         // generate d_flags_array from d_row_offsets using MarkSegment kernel
-        util::MarkSegmentFromIndices<<<128, 128>>>(
+        util::MarkSegmentFromIndices<bool><<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           graph_slice->d_row_offsets, graph_slice->nodes);
 
@@ -584,7 +582,7 @@ public:
 
         ////////////////////////////////////////////////////////////////////////
         // create a flag to mark the boundaries of representative vertices
-        util::MarkSegmentFromKeys<<<128, 128>>>(
+        util::MarkSegmentFromKeys<bool><<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           problem->data_slices[0]->d_supervtx_ids,
           graph_slice->nodes);
@@ -790,8 +788,7 @@ public:
           graph_slice->edges);
 
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          //problem->data_slices[0]->d_temp_value,
-            problem->data_slices[0]->d_super_edges,  // used as temp_index
+          problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
@@ -807,7 +804,6 @@ public:
 
         util::CUBRadixSort<VertexId, VertexId>(
           true, graph_slice->edges,
-          //problem->data_slices[0]->d_temp_value,
           problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_origin_edges);
 

From 0c375d00e61e67b44d2598d72801405fa6a2a493 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 13:43:31 -0700
Subject: [PATCH 15/36] blah

---
 gunrock/app/mst/mst_enactor.cuh | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh
index 4908b71b9..0c44b617c 100644
--- a/gunrock/app/mst/mst_enactor.cuh
+++ b/gunrock/app/mst/mst_enactor.cuh
@@ -215,10 +215,8 @@ public:
     typedef SuRmFunctor <VertexId, SizeT, VertexId, MSTProblem> SuRmFunctor;
     typedef EIdxFunctor <VertexId, SizeT, VertexId, MSTProblem> EIdxFunctor;
     typedef MarkFunctor <VertexId, SizeT, VertexId, MSTProblem> MarkFunctor;
-    //typedef OrFunctor   <VertexId, SizeT, VertexId, MSTProblem> OrFunctor;
 
     cudaError_t retval = cudaSuccess;
-
     unsigned int *d_scanned_edges = NULL;
 
     do
@@ -273,7 +271,7 @@ public:
         }
 
         // generate d_flags_array from d_row_offsets using MarkSegment kernel
-        util::MarkSegmentFromIndices<<<128, 128>>>(
+        util::MarkSegmentFromIndices<bool><<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           graph_slice->d_row_offsets, graph_slice->nodes);
 
@@ -584,7 +582,7 @@ public:
 
         ////////////////////////////////////////////////////////////////////////
         // create a flag to mark the boundaries of representative vertices
-        util::MarkSegmentFromKeys<<<128, 128>>>(
+        util::MarkSegmentFromKeys<bool><<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           problem->data_slices[0]->d_supervtx_ids,
           graph_slice->nodes);
@@ -790,8 +788,7 @@ public:
           graph_slice->edges);
 
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          //problem->data_slices[0]->d_temp_value,
-            problem->data_slices[0]->d_super_edges,  // used as temp_index
+          problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
@@ -807,7 +804,6 @@ public:
 
         util::CUBRadixSort<VertexId, VertexId>(
           true, graph_slice->edges,
-          //problem->data_slices[0]->d_temp_value,
           problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_origin_edges);
 

From 9e78f8bf51ae3a63f0922496e71ecdf56aa6ae37 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 17 Jun 2015 13:45:18 -0700
Subject: [PATCH 16/36] clean up

---
 gunrock/app/mst/mst_enactor.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh
index 0c44b617c..fa8dac343 100644
--- a/gunrock/app/mst/mst_enactor.cuh
+++ b/gunrock/app/mst/mst_enactor.cuh
@@ -271,7 +271,7 @@ public:
         }
 
         // generate d_flags_array from d_row_offsets using MarkSegment kernel
-        util::MarkSegmentFromIndices<bool><<<128, 128>>>(
+        util::MarkSegmentFromIndices<<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           graph_slice->d_row_offsets, graph_slice->nodes);
 
@@ -582,7 +582,7 @@ public:
 
         ////////////////////////////////////////////////////////////////////////
         // create a flag to mark the boundaries of representative vertices
-        util::MarkSegmentFromKeys<bool><<<128, 128>>>(
+        util::MarkSegmentFromKeys<<<128, 128>>>(
           problem->data_slices[0]->d_flags_array,
           problem->data_slices[0]->d_supervtx_ids,
           graph_slice->nodes);

From 127eba3d41a03632a7d3bb1f1c48c00bb61a9c78 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Thu, 18 Jun 2015 08:29:37 -0700
Subject: [PATCH 17/36] Vertex-Induced Subgraph primitive

---
 gunrock/app/vis/vis_enactor.cuh | 395 ++++++++++++++++++++++++++++++++
 gunrock/app/vis/vis_functor.cuh | 108 +++++++++
 gunrock/app/vis/vis_problem.cuh | 294 ++++++++++++++++++++++++
 tests/vis/Makefile              | 108 +++++++++
 tests/vis/run.sh                |  28 +++
 tests/vis/test_vis.cu           | 346 ++++++++++++++++++++++++++++
 6 files changed, 1279 insertions(+)
 create mode 100644 gunrock/app/vis/vis_enactor.cuh
 create mode 100644 gunrock/app/vis/vis_functor.cuh
 create mode 100644 gunrock/app/vis/vis_problem.cuh
 create mode 100644 tests/vis/Makefile
 create mode 100644 tests/vis/run.sh
 create mode 100644 tests/vis/test_vis.cu

diff --git a/gunrock/app/vis/vis_enactor.cuh b/gunrock/app/vis/vis_enactor.cuh
new file mode 100644
index 000000000..590863cb6
--- /dev/null
+++ b/gunrock/app/vis/vis_enactor.cuh
@@ -0,0 +1,395 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_enactor.cuh
+ * @brief Primitive problem enactor for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/util/kernel_runtime_stats.cuh>
+#include <gunrock/util/test_utils.cuh>
+
+#include <gunrock/oprtr/advance/kernel.cuh>
+#include <gunrock/oprtr/advance/kernel_policy.cuh>
+#include <gunrock/oprtr/filter/kernel.cuh>
+#include <gunrock/oprtr/filter/kernel_policy.cuh>
+
+#include <gunrock/app/enactor_base.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+#include <gunrock/app/vis/vis_functor.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Primitive enactor class.
+ * @tparam INSTRUMENT Whether to collect per-CTA clock-count statistics
+ */
+template<bool INSTRUMENT>
+class VISEnactor : public EnactorBase {
+ protected:
+    /**
+     * A pinned, mapped word that the traversal kernels will signal when done
+     */
+    volatile int *done;
+    int          *d_done;
+    cudaEvent_t  throttle_event;
+
+    /**
+     * @brief Prepare the enactor for kernel call.
+     * @param[in] problem Problem object holds both graph and primitive data.
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <typename ProblemData>
+    cudaError_t Setup(ProblemData *problem) {
+        typedef typename ProblemData::SizeT    SizeT;
+        typedef typename ProblemData::VertexId VertexId;
+
+        cudaError_t retval = cudaSuccess;
+
+        // first call only: create the host-mapped "done" word and the event
+        if (!done) {
+            int flags = cudaHostAllocMapped;
+
+            // allocate pinned memory for done
+            if (retval = util::GRError(
+                    cudaHostAlloc((void**)&done, sizeof(int) * 1, flags),
+                    "Enactor cudaHostAlloc done failed",
+                    __FILE__, __LINE__)) return retval;
+
+            // map done into GPU space
+            if (retval = util::GRError(
+                    cudaHostGetDevicePointer((void**)&d_done, (void*) done, 0),
+                    "Enactor cudaHostGetDevicePointer done failed",
+                    __FILE__, __LINE__)) return retval;
+
+            // create throttle event
+            if (retval = util::GRError(
+                    cudaEventCreateWithFlags(&throttle_event, cudaEventDisableTiming),
+                    "Enactor cudaEventCreateWithFlags throttle_event failed",
+                    __FILE__, __LINE__)) return retval;
+        }
+
+        done[0] = -1;
+
+        // graph slice 0 (the only slice this enactor reads)
+        typename ProblemData::GraphSlice *graph_slice = problem->graph_slices[0];
+        // TODO: uncomment if using data_slice to store primitive-specific array
+        //typename ProblemData::DataSlice *data_slice = problem->data_slices[0];
+
+        do {
+            // bind the row-offsets texture (no bit-mask texture is bound here)
+            cudaChannelFormatDesc row_offsets_desc = cudaCreateChannelDesc<SizeT>();
+            oprtr::edge_map_forward::RowOffsetTex<SizeT>::ref.channelDesc = row_offsets_desc;
+            if (retval = util::GRError(
+                    cudaBindTexture(
+                        0,
+                        oprtr::edge_map_forward::RowOffsetTex<SizeT>::ref,
+                        graph_slice->d_row_offsets,
+                        (graph_slice->nodes + 1) * sizeof(SizeT)),
+                    "Enactor cudaBindTexture row_offset_tex_ref failed",
+                    __FILE__, __LINE__)) break;
+        } while (0);
+        return retval;
+    }
+
+ public:
+    /**
+     * @brief Constructor; done and d_done start NULL until Setup() runs.
+     */
+    explicit VISEnactor(bool DEBUG = false) :
+        EnactorBase(EDGE_FRONTIERS, DEBUG), done(NULL), d_done(NULL) {}
+
+    /**
+     * @brief Destructor; frees done and throttle_event only if done was set.
+     */
+    virtual ~VISEnactor() {
+        if (done) {
+            util::GRError(cudaFreeHost((void*)done),
+                "Enactor FreeHost done failed", __FILE__, __LINE__);
+            util::GRError(cudaEventDestroy(throttle_event),
+                "Enactor Destroy throttle_event failed", __FILE__, __LINE__);
+        }
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Obtain statistics about the primitive that was enacted.
+     * @param[out] num_iterations Number of iterations (BSP super-steps).
+     */
+    template <typename VertexId>
+    void GetStatistics(VertexId &num_iterations) {
+        cudaThreadSynchronize();  // let outstanding device work finish first
+        num_iterations = enactor_stats.iteration;
+    }
+
+    /** @} */
+
+    /**
+     * @brief Enacts computing on the specified graph.
+     *
+     * @tparam AdvanceKernelPolicy Kernel policy for advance operator.
+     * @tparam FilterKernelPolicy Kernel policy for filter operator.
+     * @tparam Problem Problem type.
+     *
+     * @param[in] context CudaContext pointer for ModernGPU APIs
+     * @param[in] problem Problem object.
+     * @param[in] max_grid_size Max grid size for kernel calls.
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <
+        typename AdvanceKernelPolicy,
+        typename FilterKernelPolicy,
+        typename Problem >
+    cudaError_t EnactVIS(
+        CudaContext & context,
+        Problem     * problem,
+        int         max_grid_size = 0) {
+        typedef typename Problem::VertexId VertexId;
+        typedef typename Problem::Value    Value;
+        typedef typename Problem::SizeT    SizeT;
+
+        // argument order follows VISFunctor<VertexId, SizeT, Value, ProblemData>
+        typedef VISFunctor<VertexId, SizeT, Value, Problem> Functor;
+
+        cudaError_t retval = cudaSuccess;
+        unsigned int *d_scanned_edges = NULL;  // freed once, after the do-block
+
+        do {
+            fflush(stdout);
+
+            // lazy initialization
+            if (retval = Setup(problem)) break;
+
+            if (retval = EnactorBase::Setup(
+                    max_grid_size,
+                    AdvanceKernelPolicy::CTA_OCCUPANCY,
+                    FilterKernelPolicy::CTA_OCCUPANCY))
+                break;
+
+            // single-gpu graph slice and data slice
+            typename Problem::GraphSlice *g_slice = problem->graph_slices[0];
+            typename Problem::DataSlice *d_slice = problem->d_data_slices[0];
+
+            if (AdvanceKernelPolicy::ADVANCE_MODE == oprtr::advance::LB) {
+                if (retval = util::GRError(
+                        cudaMalloc((void**)&d_scanned_edges,
+                        g_slice->edges * sizeof(unsigned int)),
+                        "VISProblem cudaMalloc d_scanned_edges failed",
+                        __FILE__, __LINE__)) break;
+            }
+
+            frontier_attribute.queue_length = g_slice->nodes;
+            frontier_attribute.queue_index  = 0;  // work queue index
+            frontier_attribute.selector     = 0;
+            frontier_attribute.queue_reset  = true;
+
+            // filter: input all vertices in graph, output selected vertices
+            oprtr::filter::Kernel<FilterKernelPolicy, Problem, Functor>
+                <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
+                enactor_stats.iteration + 1,
+                frontier_attribute.queue_reset,
+                frontier_attribute.queue_index,
+                enactor_stats.num_gpus,
+                frontier_attribute.queue_length,
+                d_done,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                NULL,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                d_slice,
+                NULL,
+                work_progress,
+                g_slice->frontier_elements[frontier_attribute.selector],
+                g_slice->frontier_elements[frontier_attribute.selector^1],
+                enactor_stats.filter_kernel_stats);
+
+            if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
+                "filter::Kernel failed", __FILE__, __LINE__))) break;
+            cudaEventQuery(throttle_event);
+
+            frontier_attribute.queue_index++;
+            frontier_attribute.selector ^= 1;
+
+            if (retval = work_progress.GetQueueLength(
+                    frontier_attribute.queue_index,
+                    frontier_attribute.queue_length)) break;
+            if (DEBUG) {
+                printf("filter queue length: %lld",
+                       (long long) frontier_attribute.queue_length);
+                util::DisplayDeviceResults(
+                    problem->data_slices[0]->d_bitmask, g_slice->nodes);
+                printf("input queue for advance:\n");
+                util::DisplayDeviceResults(
+                    g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                    frontier_attribute.queue_length);
+            }
+
+            oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, Functor>(
+                NULL,
+                enactor_stats,
+                frontier_attribute,
+                d_slice,
+                (VertexId*)NULL,
+                (bool*)NULL,
+                (bool*)NULL,
+                d_scanned_edges,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                (VertexId*)NULL,
+                (VertexId*)NULL,
+                g_slice->d_row_offsets,
+                g_slice->d_column_indices,
+                (SizeT*)NULL,
+                (VertexId*)NULL,
+                g_slice->nodes,
+                g_slice->edges,
+                this->work_progress,
+                context,
+                gunrock::oprtr::advance::V2V);
+
+            if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
+                "advance::Kernel failed", __FILE__, __LINE__))) break;
+            cudaEventQuery(throttle_event);
+
+            frontier_attribute.queue_index++;
+
+            if (DEBUG) {
+                if (retval = work_progress.GetQueueLength(
+                        frontier_attribute.queue_index,
+                        frontier_attribute.queue_length)) break;
+                printf("advance queue length: %lld",
+                       (long long) frontier_attribute.queue_length);
+                util::DisplayDeviceResults(
+                    g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                    frontier_attribute.queue_length);
+            }
+
+            // TODO: extract graph with proper format (edge list, csr, etc.)
+
+        } while (0);
+
+        if (d_scanned_edges) cudaFree(d_scanned_edges);  // runs after a break too
+
+        if (DEBUG) {
+            printf("\nGPU Vertex-Induced Subgraph Enact Done.\n");
+        }
+
+        return retval;
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Primitive enact kernel entry.
+     *
+     * @tparam Problem Problem type. @see Problem
+     *
+     * @param[in] context CudaContext pointer for ModernGPU APIs
+     * @param[in] problem Pointer to Problem object.
+     * @param[in] max_grid_size Max grid size for kernel calls.
+     * @param[in] traversal_mode Traversal Mode for advance operator:
+     *            Load-balanced or Dynamic cooperative
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <typename Problem>
+    cudaError_t Enact(
+        CudaContext &context,
+        Problem     *problem,
+        int         max_grid_size  = 0,
+        int         traversal_mode = 0) {
+        if (this->cuda_props.device_sm_version >= 300) {
+            typedef oprtr::filter::KernelPolicy <
+                Problem,             // Problem data type
+                300,                 // CUDA_ARCH
+                INSTRUMENT,          // INSTRUMENT
+                0,                   // SATURATION QUIT
+                true,                // DEQUEUE_PROBLEM_SIZE
+                8,                   // MIN_CTA_OCCUPANCY
+                8,                   // LOG_THREADS
+                1,                   // LOG_LOAD_VEC_SIZE
+                0,                   // LOG_LOADS_PER_TILE
+                5,                   // LOG_RAKING_THREADS
+                5,                   // END_BITMASK_CULL
+                8 >                  // LOG_SCHEDULE_GRANULARITY
+                FilterKernelPolicy;
+
+            typedef oprtr::advance::KernelPolicy <
+                Problem,             // Problem data type
+                300,                 // CUDA_ARCH
+                INSTRUMENT,          // INSTRUMENT
+                1,                   // MIN_CTA_OCCUPANCY
+                7,                   // LOG_THREADS
+                8,                   // LOG_BLOCKS
+                32 * 128,            // LIGHT_EDGE_THRESHOLD (used for LB)
+                1,                   // LOG_LOAD_VEC_SIZE
+                0,                   // LOG_LOADS_PER_TILE
+                5,                   // LOG_RAKING_THREADS
+                32,                  // WARP_GATHER_THRESHOLD
+                128 * 4,             // CTA_GATHER_THRESHOLD
+                7,                   // LOG_SCHEDULE_GRANULARITY
+                oprtr::advance::TWC_FORWARD >
+                ForwardAdvanceKernelPolicy;
+
+            typedef oprtr::advance::KernelPolicy <
+                Problem,             // Problem data type
+                300,                 // CUDA_ARCH
+                INSTRUMENT,          // INSTRUMENT
+                1,                   // MIN_CTA_OCCUPANCY
+                10,                  // LOG_THREADS
+                8,                   // LOG_BLOCKS
+                32 * 128,            // LIGHT_EDGE_THRESHOLD (used for LB)
+                1,                   // LOG_LOAD_VEC_SIZE
+                0,                   // LOG_LOADS_PER_TILE
+                5,                   // LOG_RAKING_THREADS
+                32,                  // WARP_GATHER_THRESHOLD
+                128 * 4,             // CTA_GATHER_THRESHOLD
+                7,                   // LOG_SCHEDULE_GRANULARITY
+                oprtr::advance::LB >
+                LBAdvanceKernelPolicy;
+
+            if (traversal_mode == 0) {
+                return EnactVIS<
+                    LBAdvanceKernelPolicy, FilterKernelPolicy, Problem>(
+                        context, problem, max_grid_size);
+            } else {  // any non-zero traversal_mode selects TWC_FORWARD
+                return EnactVIS<
+                    ForwardAdvanceKernelPolicy, FilterKernelPolicy, Problem>(
+                        context, problem, max_grid_size);
+            }
+        }
+
+        // to reduce compile time, only device_sm_version >= 300 is handled now
+        // TODO: add all the kernel policy setting for all architectures
+
+        printf("Not yet tuned for this architecture\n");
+        return cudaErrorInvalidDeviceFunction;
+    }
+
+    /** @} */
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/app/vis/vis_functor.cuh b/gunrock/app/vis/vis_functor.cuh
new file mode 100644
index 000000000..7611d42d0
--- /dev/null
+++ b/gunrock/app/vis/vis_functor.cuh
@@ -0,0 +1,108 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_functor.cuh
+ * @brief Device functions for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/app/problem_base.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Structure contains device functions
+ *
+ * @tparam VertexId    Type used for vertex id (e.g., uint32)
+ * @tparam SizeT       Type used for array indexing. (e.g., uint32)
+ * @tparam Value       Type used for calculation values (e.g., float)
+ * @tparam ProblemData Problem data type which contains data slice
+ *
+ */
+template<typename VertexId, typename SizeT,
+         typename Value, typename ProblemData>
+struct VISFunctor {
+    typedef typename ProblemData::DataSlice DataSlice;
+
+    /**
+     * @brief Advance condition: keep an edge when d_bitmask[d_id] is set
+     *
+     * @param[in] s_id Vertex Id of the edge source node
+     * @param[in] d_id Vertex Id of the edge destination node
+     * @param[in] problem Data slice object
+     * @param[in] e_id Output edge id
+     * @param[in] e_id_in Input edge id
+     *
+     * \return Whether to load the apply function for the edge and
+     *         include the destination node in the next frontier.
+     */
+    static __device__ __forceinline__ bool
+    CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem,
+             VertexId e_id = 0, VertexId e_id_in = 0) {
+        return problem->d_bitmask[d_id];
+    }
+
+    /**
+     * @brief Advance apply: prints the selected edge with device printf
+     *
+     * @param[in] s_id Vertex Id of the edge source node
+     * @param[in] d_id Vertex Id of the edge destination node
+     * @param[in] problem Data slice object
+     * @param[in] e_id Output edge id
+     * @param[in] e_id_in Input edge id
+     *
+     */
+    static __device__ __forceinline__ void
+    ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem,
+              VertexId e_id = 0, VertexId e_id_in = 0) {
+        printf("select edges: sid: %d, did: %d, eid: %d\n", s_id, d_id, e_id);
+    }
+
+    /**
+     * @brief Filter condition: placeholder that keeps even vertex ids
+     *
+     * @param[in] node Vertex Id
+     * @param[in] problem Data slice object
+     * @param[in] v Auxiliary value
+     * @param[in] nid Not read by this function
+     * \return Whether to load the apply function for the node and
+     *         include it in the outgoing vertex frontier.
+     */
+    static __device__ __forceinline__ bool
+    CondFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) {
+        return (node % 2) == 0;  // TODO: USER-DEFINED FILTER CONDITION HERE
+    }
+
+    /**
+     * @brief Filter apply: stores true into d_bitmask[node]
+     *
+     * @param[in] node Vertex Id
+     * @param[in] problem Data slice object
+     * @param[in] v Auxiliary value
+     * @param[in] nid Not read by this function
+     */
+    static __device__ __forceinline__ void
+    ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) {
+        util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
+            true, problem->d_bitmask + node);
+    }
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/app/vis/vis_problem.cuh b/gunrock/app/vis/vis_problem.cuh
new file mode 100644
index 000000000..85519391b
--- /dev/null
+++ b/gunrock/app/vis/vis_problem.cuh
@@ -0,0 +1,294 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_problem.cuh
+ * @brief GPU storage management structure for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/app/problem_base.cuh>
+#include <gunrock/util/memset_kernel.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Problem structure stores device-side vectors
+ * @tparam _VertexId Type use as vertex id (e.g., uint32)
+ * @tparam _SizeT    Type use for array indexing. (e.g., uint32)
+ * @tparam _Value    Type use for computed value.
+ */
+template<typename _VertexId, typename _SizeT, typename _Value>
+struct VISProblem : ProblemBase<_VertexId, _SizeT, false> {
+    typedef _VertexId VertexId;
+    typedef _SizeT    SizeT;
+    typedef _Value    Value;
+
+    static const bool MARK_PREDECESSORS  = true;
+    static const bool ENABLE_IDEMPOTENCE = false;
+
+    /** @brief Data slice structure which contains problem specific data. */
+    struct DataSlice {
+        // device storage arrays
+        VertexId *d_labels;   // per-vertex labels; Reset() fills them with -1
+        bool     *d_bitmask;  // used for indicating if vertex is in subgraph
+        // start out NULL so Reset() / ~VISProblem() can test "allocated?"
+        DataSlice() : d_labels(NULL), d_bitmask(NULL) {}
+    };
+
+    int       num_gpus;
+    SizeT     nodes;
+    SizeT     edges;
+
+    // data slices (one for each GPU)
+    DataSlice **data_slices;
+
+    // putting structure on device while keeping the SoA structure
+    DataSlice **d_data_slices;
+
+    // device index for each data slice
+    int       *gpu_idx;
+
+    /** @brief Default constructor: empty problem; call Init() before use. */
+    VISProblem() :
+        num_gpus(0), nodes(0), edges(0),
+        data_slices(NULL), d_data_slices(NULL), gpu_idx(NULL) {}
+
+    /**
+     * @brief Constructor; forwards to Init() and discards its error code.
+     * @param[in] stream_from_host Whether to stream data from host.
+     * @param[in] graph Reference to the CSR graph object we process on.
+     * @param[in] num_gpus Number of the GPUs used.
+     */
+    VISProblem(bool stream_from_host,  // only meaningful for single-GPU
+               const Csr<VertexId, Value, SizeT> &graph, int num_gpus) :
+        num_gpus(0), nodes(0), edges(0),  // Init() fills in the real values
+        data_slices(NULL), d_data_slices(NULL), gpu_idx(NULL) {
+        Init(stream_from_host, graph, num_gpus);
+    }
+
+    /**
+     * @brief Destructor: frees the device arrays and host-side bookkeeping
+     */
+    ~VISProblem() {
+        for (int i = 0; i < num_gpus; ++i) {
+            if (util::GRError(
+                cudaSetDevice(gpu_idx[i]),
+                "~Problem cudaSetDevice failed", __FILE__, __LINE__)) break;
+            if (data_slices[i]->d_labels)
+                util::GRError(cudaFree(data_slices[i]->d_labels),
+                    "GpuSlice cudaFree d_labels failed", __FILE__, __LINE__);
+            if (data_slices[i]->d_bitmask)
+                util::GRError(cudaFree(data_slices[i]->d_bitmask),
+                    "DataSlice cudaFree d_bitmask failed", __FILE__, __LINE__);
+            if (d_data_slices[i])
+                util::GRError(cudaFree(d_data_slices[i]),
+                    "GpuSlice cudaFree data_slices failed", __FILE__, __LINE__);
+            // the host-side slice itself was created with `new` in Init()
+            delete data_slices[i];
+        }
+        if (d_data_slices) delete[] d_data_slices;
+        if (data_slices)   delete[]   data_slices;
+        if (num_gpus > 0) free(gpu_idx);  // malloc()'ed by Init()
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Copy results computed on the GPU back to host-side vectors.
+     * @param[out] h_labels Host array that receives `nodes` labels.
+     *\return cudaError_t object indicates the success of all CUDA functions.
+     */
+    cudaError_t Extract(VertexId *h_labels) {
+        cudaError_t retval = cudaSuccess;
+        // every failure below is stored in retval before bailing out
+        do {
+            if (num_gpus == 1) {
+                if (retval = util::GRError(cudaSetDevice(gpu_idx[0]),
+                                  "Problem cudaSetDevice failed",
+                                  __FILE__, __LINE__)) break;
+
+                if (retval = util::GRError(
+                        cudaMemcpy(h_labels,
+                                   data_slices[0]->d_labels,
+                                   sizeof(VertexId) * nodes,
+                                   cudaMemcpyDeviceToHost),
+                        "Problem cudaMemcpy d_labels failed",
+                        __FILE__, __LINE__)) break;
+
+                // TODO: code to extract other results here
+
+            } else {
+                // multi-GPU extension code
+            }
+        } while (0);
+
+        return retval;
+    }
+
+    /**
+     * @brief Problem initialization
+     *
+     * @param[in] stream_from_host Whether to stream data from host.
+     * @param[in] graph Reference to the CSR graph object we process on.
+     * @param[in] _num_gpus Number of the GPUs used (only 1 is implemented).
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    cudaError_t Init(
+        bool  stream_from_host,  // only meaningful for single-GPU
+        const Csr<VertexId, Value, SizeT> &graph,
+        int   _num_gpus) {
+        // Only the single-GPU path allocates slices; other counts left gpu_idx
+        // and the slices unset (0 even overflowed the zero-length arrays).
+        if (_num_gpus != 1) {
+            return util::GRError(cudaErrorInvalidValue,
+                "Problem Init: only num_gpus == 1 is supported",
+                __FILE__, __LINE__);
+        }
+        num_gpus = _num_gpus;
+        nodes    = graph.nodes;
+        edges    = graph.edges;
+
+        ProblemBase<_VertexId, _SizeT, false>::Init(
+            stream_from_host,
+            nodes,
+            edges,
+            graph.row_offsets,
+            graph.column_indices,
+            NULL,
+            NULL,
+            num_gpus);
+
+        // no data in DataSlice needs to be copied from host
+
+        // Host-side bookkeeping is zero-initialized so that a partially
+        // failed Init() leaves NULLs, not garbage, for the destructor.
+        cudaError_t retval = cudaSuccess;
+        data_slices    = new DataSlice * [num_gpus]();
+        d_data_slices  = new DataSlice * [num_gpus]();
+        data_slices[0] = new DataSlice();
+        gpu_idx        = (int*)calloc(num_gpus, sizeof(int));
+
+        do {
+            // create a single data slice for the currently-set GPU
+            int gpu;
+            if (retval = util::GRError(
+                cudaGetDevice(&gpu), "Problem cudaGetDevice failed",
+                __FILE__, __LINE__)) break;
+            gpu_idx[0] = gpu;
+
+            if (retval = util::GRError(
+                cudaMalloc((void**)&d_data_slices[0], sizeof(DataSlice)),
+                "Problem cudaMalloc d_data_slices failed",
+                __FILE__, __LINE__)) break;
+
+            // create SoA on device
+            VertexId *d_labels;
+            if (retval = util::GRError(
+                cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)),
+                "Problem cudaMalloc d_labels failed",
+                __FILE__, __LINE__)) break;
+            data_slices[0]->d_labels = d_labels;
+
+            bool *d_bitmask;
+            if (retval = util::GRError(
+                cudaMalloc((void**)&d_bitmask, nodes * sizeof(bool)),
+                "Problem cudaMalloc d_bitmask failed",
+                __FILE__, __LINE__)) break;
+            data_slices[0]->d_bitmask = d_bitmask;
+            util::MemsetKernel<<<128, 128>>>(
+               data_slices[0]->d_bitmask, (bool)false, nodes);
+        } while (0);
+
+        return retval;
+    }
+
+    /**
+     *  @brief Performs any initialization work needed for primitive
+     *  @param[in] frontier_type Frontier type (i.e., edge / vertex / mixed)
+     *  @param[in] queue_sizing Size scaling factor for work queue allocation
+     *  \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    cudaError_t Reset(
+        FrontierType frontier_type,  // type (i.e., edge / vertex / mixed)
+        double queue_sizing) {
+        // size scaling factor for work queue allocation (e.g., 1.0 creates
+        // n-element and m-element vertex and edge frontiers, respectively).
+        // 0.0 is unspecified.
+        typedef ProblemBase<_VertexId, _SizeT, false> BaseProblem;
+
+        // load ProblemBase Reset
+        BaseProblem::Reset(frontier_type, queue_sizing);
+
+        cudaError_t retval = cudaSuccess;
+
+        for (int gpu = 0; gpu < num_gpus; ++gpu) {
+            // setting device
+            if (retval = util::GRError(
+                    cudaSetDevice(gpu_idx[gpu]),
+                    "Problem cudaSetDevice failed",
+                    __FILE__, __LINE__)) return retval;
+            // allocate output labels if necessary
+            if (!data_slices[gpu]->d_labels) {
+                VertexId *d_labels;
+                if (retval = util::GRError(
+                        cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)),
+                        "Problem cudaMalloc d_labels failed",
+                        __FILE__, __LINE__)) return retval;
+                data_slices[gpu]->d_labels = d_labels;
+            }
+            util::MemsetKernel<<< 128, 128>>>(
+                data_slices[gpu]->d_labels, -1, nodes);
+
+            // allocate the subgraph mask if necessary
+            if (!data_slices[gpu]->d_bitmask) {
+                bool *d_bitmask;
+                if (retval = util::GRError(cudaMalloc(
+                    (void**)&d_bitmask, nodes * sizeof(bool)),
+                    "Problem cudaMalloc d_bitmask failed",
+                    __FILE__, __LINE__)) return retval;
+                data_slices[gpu]->d_bitmask = d_bitmask;
+            }
+            // fresh or reused, the mask must start out empty on every run
+            util::MemsetKernel<<<128, 128>>>(
+                data_slices[gpu]->d_bitmask, (bool)false, nodes);
+
+            if (retval = util::GRError(
+                    cudaMemcpy(d_data_slices[gpu],
+                               data_slices[gpu],
+                               sizeof(DataSlice),
+                               cudaMemcpyHostToDevice),
+                    "Problem cudaMemcpy data_slices to d_data_slices failed",
+                    __FILE__, __LINE__)) return retval;
+        }
+
+        // TODO: fill in the initial input_queue (here: every vertex is queued)
+        util::MemsetIdxKernel<<<128, 128>>>(
+            BaseProblem::graph_slices[0]->frontier_queues.d_keys[0], nodes);
+
+        return retval;
+    }
+
+    /** @} */
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/tests/vis/Makefile b/tests/vis/Makefile
new file mode 100644
index 000000000..7931cd948
--- /dev/null
+++ b/tests/vis/Makefile
@@ -0,0 +1,108 @@
+# -----------------------------------------------------------------------------
+# Gunrock -- High-Performance Graph Primitives on GPU
+# -----------------------------------------------------------------------------
+# This source code is distributed under the terms of LICENSE.TXT
+# in the root directory of this source distribution.
+# -----------------------------------------------------------------------------
+# Build script for project
+# -----------------------------------------------------------------------------
+
+force64 = 1
+NVCC := "$(shell which nvcc)"
+NVCC_VERSION := $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/,.*//'))
+
+KERNELS =
+
+# detect OS
+OSUPPER := $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
+
+# -----------------------------------------------------------------------------
+# Gen targets
+# -----------------------------------------------------------------------------
+
+GEN_SM35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+GEN_SM30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+SM_TARGETS = $(GEN_SM35)
+
+# -----------------------------------------------------------------------------
+# Libs
+# -----------------------------------------------------------------------------
+
+# -----------------------------------------------------------------------------
+# Includes
+# -----------------------------------------------------------------------------
+
+CUDA_INC := "$(shell dirname $(NVCC))/../include"
+MGPU_INC = "../../externals/moderngpu/include"
+INC = -I$(CUDA_INC) -I$(MGPU_INC) -I.. -I../..
+
+# -----------------------------------------------------------------------------
+# Defines
+# -----------------------------------------------------------------------------
+
+DEFINES =
+
+# -----------------------------------------------------------------------------
+# Compiler Flags
+# -----------------------------------------------------------------------------
+
+ifneq ($(force64), 1)
+  # Compile with 32-bit device pointers by default
+  ARCH_SUFFIX = i386
+  ARCH = -m32
+else
+  ARCH_SUFFIX = x86_64
+  ARCH = -m64
+endif
+
+NVCCFLAGS = -Xcudafe -\#
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+  NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+endif
+
+
+ifeq ($(verbose), 1)
+  NVCCFLAGS += -v
+endif
+
+ifeq ($(keep), 1)
+  NVCCFLAGS += -keep
+endif
+
+ifdef maxregisters
+  NVCCFLAGS += -maxrregcount $(maxregisters)
+endif
+
+# -----------------------------------------------------------------------------
+# Dependency Lists
+# -----------------------------------------------------------------------------
+
+DEPS := ./Makefile \
+	$(wildcard ../../gunrock/util/*.cuh) \
+	$(wildcard ../../gunrock/util/**/*.cuh) \
+	$(wildcard ../../gunrock/*.cuh) \
+	$(wildcard ../../gunrock/graphio/*.cuh) \
+	$(wildcard ../../gunrock/oprtr/*.cuh) \
+	$(wildcard ../../gunrock/oprtr/**/*.cuh) \
+	$(wildcard ../../gunrock/app/*.cuh) \
+	$(wildcard ../../gunrock/app/**/*.cuh)
+
+# -----------------------------------------------------------------------------
+# (make test) Test driver for Vertex-Induced Subgraph (vis)
+# -----------------------------------------------------------------------------
+
+.PHONY: test clean
+test: bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX)
+
+bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) : test_vis.cu ../../gunrock/util/test_utils.cu ../../gunrock/util/error_utils.cu ../../externals/moderngpu/src/mgpucontext.cu ../../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
+	mkdir -p $(@D)
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o $@ $(filter %.cu %.cpp,$^) $(NVCCFLAGS) $(ARCH) $(INC) -O3
+
+# -----------------------------------------------------------------------------
+# Clean
+# -----------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*_$(NVCC_VERSION)_$(ARCH_SUFFIX)*
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
diff --git a/tests/vis/run.sh b/tests/vis/run.sh
new file mode 100644
index 000000000..708cedfec
--- /dev/null
+++ b/tests/vis/run.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# get all execution files in ./bin
+files=(./bin/*)
+# largest "X.Y" version seen so far ("" sorts before any version string)
+max_ver_num=""
+exe_file=${files[0]}
+# iterate over all file names to get the largest version number
+for x in "${files[@]}"
+do
+    output=$(grep -o "[0-9]\.[0-9]" <<<"$x" | head -n 1)
+    if [ "$output" \> "$max_ver_num" ]; then
+        max_ver_num=$output
+        exe_file=$x
+    fi
+done
+
+# put OS and Device type here
+SUFFIX="ubuntu12.04.k40c"
+
+mkdir -p "eval/$SUFFIX"
+
+for i in test_bc
+do
+    echo "$exe_file" market "../../dataset/small/$i.mtx"
+    "$exe_file" market "../../dataset/small/$i.mtx" > "eval/$SUFFIX/$i.$SUFFIX.txt"
+    sleep 1
+done
diff --git a/tests/vis/test_vis.cu b/tests/vis/test_vis.cu
new file mode 100644
index 000000000..3584b9bff
--- /dev/null
+++ b/tests/vis/test_vis.cu
@@ -0,0 +1,346 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file test_vis.cu
+ * @brief Simple test driver program for Vertex-Induced Subgraph
+ */
+
+#include <stdio.h>
+#include <string>
+#include <deque>
+#include <vector>
+#include <iostream>
+
+// utilities for correctness checking
+#include <gunrock/util/test_utils.cuh>
+
+// graph construction utilities
+#include <gunrock/graphio/market.cuh>
+
+// primitive-specific headers include
+#include <gunrock/app/vis/vis_enactor.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+#include <gunrock/app/vis/vis_functor.cuh>
+
+// gunrock abstraction graph operators
+#include <gunrock/oprtr/advance/kernel.cuh>
+#include <gunrock/oprtr/filter/kernel.cuh>
+
+#include <moderngpu.cuh>
+
+using namespace gunrock;
+using namespace gunrock::util;
+using namespace gunrock::oprtr;
+using namespace gunrock::app::vis;
+
+// ----------------------------------------------------------------------------
+// Defines, constants, globals
+// ----------------------------------------------------------------------------
+
+bool g_verbose;
+bool g_undirected;
+bool g_quick;
+bool g_stream_from_host;
+
+// ----------------------------------------------------------------------------
+// Housekeeping Routines
+// ----------------------------------------------------------------------------
+void Usage() {
+    printf(
+        " test_vis <graph type> <graph type args> [--undirected] [--quick] [--v]\n"
+        " [--device=<device_index>] [--instrumented] [--iteration-num=<num>]\n"
+        " [--traversal-mode=<0|1>] [--queue-sizing=<factor>] [--grid-size=<n>]\n"
+        "Graph types and arguments:\n"
+        "  market <file>  Reads a Matrix-Market coordinate-formatted graph,\n"
+        "    edges from STDIN (or from the optionally-specified file)\n"
+        "  --device=<device_index>   Set GPU device to run. [Default: 0]\n"
+        "  --undirected              Convert the graph to undirected\n"
+        "  --instrumented            Keep kernel statistics [Default: Disable]\n"
+        "                            total_queued, search_depth and avg_duty\n"
+        "                            (a relative indicator of load imbalance)\n"
+        "  --quick                   Skip the CPU validation [Default: false]\n"
+        "  --queue-sizing=<factor>   Allocates a frontier queue sized at: \n"
+        "                            (graph-edges * <factor>) [Default: 1.0]\n"
+        "  --grid-size=<n>           Maximum grid size [Default: 0, automatic]\n"
+        "  --v                       Print verbose per iteration debug info\n"
+        "  --iteration-num=<number>  Number of tests to run [Default: 1]\n"
+        "  --traversal-mode=<0 | 1>  Set strategy, 0 for Load-Balanced,\n"
+        "                            1 for Dynamic-Cooperative\n"
+        "                            [Default: according to topology]\n");
+}
+
+/**
+ * @brief Displays primitive result (placeholder: prints nothing yet)
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ * @param[in] graph Reference to the CSR graph (currently unused)
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph) {
+    // TODO: code to print out results
+}
+
+
+/**
+ * @brief Performance / Evaluation statistics for one named test run
+ */
+struct Stats {
+    const char *name;          // label printed by DisplayStats(); not owned
+    Statistic num_iterations;  // default-constructed; not read by this driver
+    Stats() : name(NULL), num_iterations() {}
+    explicit Stats(const char *name) : name(name), num_iterations() {}
+};
+
+/**
+ * @brief Prints the name of the finished test and its elapsed time
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ * @param[in] stats Reference to the Stats object (only its name is printed)
+ * @param[in] graph Reference to the CSR graph we process on (unused)
+ * @param[in] elapsed Elapsed time in milliseconds
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void DisplayStats(const Stats &stats, const Csr<VertexId, Value, SizeT> &graph,
+                  const float elapsed, const long long iterations) {
+    // `iterations` is accepted for interface compatibility but not printed
+    printf("[%s] finished.\n"
+           "elapsed: %.4f ms\n", stats.name, elapsed);
+}
+
+// ----------------------------------------------------------------------------
+// Testing Routines
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief A simple CPU-based reference implementation.
+ *
+ * Placeholder: no validation is computed yet, so only the (empty) timed
+ * region is measured and reported.
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ *
+ * @param[in] graph Reference to the CSR graph we process on
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void SimpleReference(const Csr<VertexId, Value, SizeT> &graph) {
+    // nothing to set up yet; the timer brackets the future reference code
+    CpuTimer timer;
+
+    timer.Start();
+    // TODO: CPU validation code here
+    timer.Stop();
+
+    const float elapsed_ms = timer.ElapsedMillis();
+
+    printf("CPU reference finished in %lf ms.\n\n", elapsed_ms);
+}
+
+/**
+ * @brief Sample test: runs the primitive on the GPU and reports its timing.
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ * @tparam INSTRUMENT Forwarded to VISEnactor<INSTRUMENT>
+ * @param[in] graph Reference to the CSR graph we process on
+ * @param[in] max_grid_size Maximum CTA occupancy
+ * @param[in] num_gpus Number of GPUs
+ * @param[in] max_queue_sizing Scaling factor used in edge mapping
+ * @param[in] iterations Number of iterations for running the test
+ * @param[in] traversal_mode Strategy: Load-balanced or Dynamic cooperative
+ * @param[in] context Reference to the CudaContext for ModernGPU APIs
+ */
+template<typename VertexId, typename SizeT, typename Value, bool INSTRUMENT>
+void RunTest(
+    const Csr<VertexId, Value, SizeT> &graph,
+    int          max_grid_size,
+    int          num_gpus,
+    double       max_queue_sizing,
+    int          iterations,
+    int          traversal_mode,
+    CudaContext& context) {
+    typedef VISProblem<VertexId, SizeT, Value> Problem;
+
+    // host-side array receiving the GPU-computed labels
+    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
+
+    // allocate primitive enactor map
+    VISEnactor<INSTRUMENT> enactor(g_verbose);
+
+    // allocate primitive problem on GPU
+    Problem *csr_problem = new Problem;
+    Stats   *stats       = new Stats("Vertex-Induced Subgraph");
+
+    // single-pass block: the first failure skips straight to the clean up
+    do {
+        if (util::GRError(
+                csr_problem->Init(g_stream_from_host, graph, num_gpus),
+                "Problem Initialization Failed", __FILE__, __LINE__)) break;
+
+        // perform calculation
+        GpuTimer gpu_timer;
+        float elapsed = 0.0f;
+        bool  failed  = false;
+        for (int iter = 0; iter < iterations && !failed; ++iter) {
+            failed = util::GRError(
+                csr_problem->Reset(enactor.GetFrontierType(),
+                                   max_queue_sizing),
+                "Problem Data Reset Failed", __FILE__, __LINE__);
+            if (failed) break;
+            gpu_timer.Start();
+            failed = util::GRError(
+                enactor.template Enact<Problem>(context, csr_problem,
+                    max_grid_size, traversal_mode),
+                "Problem Enact Failed", __FILE__, __LINE__);
+            gpu_timer.Stop();
+            elapsed += gpu_timer.ElapsedMillis();
+        }
+        if (failed) break;
+
+        // average over the runs; guards against --iteration-num <= 0
+        if (iterations > 0) elapsed /= iterations;
+
+        // extract results
+        if (util::GRError(csr_problem->Extract(h_labels),
+                "Problem Data Extraction Failed", __FILE__, __LINE__)) break;
+
+        // compute reference CPU validation solution
+        if (!g_quick) {
+            printf("-- computing reference value ... (currently missing)\n");
+            SimpleReference<VertexId, SizeT, Value>(graph);
+            printf("-- validation: (currently missing)\n");
+        }
+
+        // display solution
+        DisplaySolution<VertexId, SizeT, Value>(graph);
+        // display statistics
+        VertexId num_iterations = 0;
+        enactor.GetStatistics(num_iterations);
+        DisplayStats<VertexId, SizeT, Value>(
+            *stats, graph, elapsed, num_iterations);
+    } while (0);
+
+    // clean up
+    delete stats;
+    delete csr_problem;
+    free(h_labels);
+
+    cudaDeviceSynchronize();
+}
+
+/**
+ * @brief Test entry: parses the run options and dispatches on --instrumented
+ *
+ * @tparam VertexId
+ * @tparam SizeT
+ * @tparam Value
+ *
+ * @param[in] graph Reference to the CSR graph we process on
+ * @param[in] args Reference to the command line arguments
+ * @param[in] context Reference to the CudaContext for ModernGPU APIs
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void RunTest(
+    Csr<VertexId, Value, SizeT> &graph,
+    CommandLineArgs &args,
+    CudaContext& context) {
+    bool   instrumented     =   0;  // Collect instrumentation from kernels
+    int    max_grid_size    =   0;  // Maximum grid size (0: up to the enactor)
+    int    num_gpus         =   1;  // Number of GPUs for multi-GPU enactor
+    double max_queue_sizing = 1.0;  // Maximum scaling factor for work queues
+    int    iterations       =   1;  // Number of runs for testing
+    int    traversal_mode   =  -1;  // Load-balanced or Dynamic cooperative
+    g_quick                 =   0;  // Whether or not to skip CPU validation
+
+    // choose traversal mode
+    args.GetCmdLineArgument("traversal-mode", traversal_mode);
+    if (traversal_mode == -1) {
+        traversal_mode = graph.GetAverageDegree() > 8 ? 0 : 1;
+    }
+
+    g_verbose    = args.CheckCmdLineFlag("v");
+    instrumented = args.CheckCmdLineFlag("instrumented");
+    g_quick = args.CheckCmdLineFlag("quick");
+
+    args.GetCmdLineArgument("iteration-num", iterations);
+    args.GetCmdLineArgument("grid-size", max_grid_size);
+    args.GetCmdLineArgument("queue-sizing", max_queue_sizing);
+
+    if (instrumented) {
+        RunTest<VertexId, SizeT, Value, true>(  // order of the 4-arg template
+            graph,
+            max_grid_size,
+            num_gpus,
+            max_queue_sizing,
+            iterations,
+            traversal_mode,
+            context);
+    } else {
+        RunTest<VertexId, SizeT, Value, false>(  // order of the 4-arg template
+            graph,
+            max_grid_size,
+            num_gpus,
+            max_queue_sizing,
+            iterations,
+            traversal_mode,
+            context);
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, char** argv) {
+    CommandLineArgs args(argc, argv);
+    if ((argc < 2) || (args.CheckCmdLineFlag("help"))) {
+        Usage();
+        return 1;
+    }
+
+    // select the GPU (defaults to device 0)
+    int device = 0;
+    args.GetCmdLineArgument("device", device);
+    ContextPtr context = mgpu::CreateCudaDevice(device);
+
+    // parse graph-construction parameters
+    g_undirected = args.CheckCmdLineFlag("undirected");
+
+    const std::string graph_type = argv[1];
+    const int graph_args = argc - args.ParsedArgc() - 1;
+    if (graph_args < 1) {
+        Usage();
+        return 1;
+    }
+
+    // "market" is the only graph type this driver accepts
+    if (graph_type != "market") {
+        fprintf(stderr, "Unspecified graph type\n");
+        return 1;
+    }
+
+    typedef int VertexId;  // Use as the vertex identifier
+    typedef int SizeT;     // Use as the graph size type
+    typedef int Value;     // Use as the value type
+
+    // matrix-market coordinate-formatted graph
+    Csr<VertexId, Value, SizeT> csr(false);
+    char *name = (graph_args == 2) ? argv[2] : NULL;
+    const int build_status =
+        graphio::BuildMarketGraph<false>(name, csr, g_undirected, false);
+    if (build_status != 0) return 1;
+
+    csr.DisplayGraph();    // display graph adjacent list
+    csr.PrintHistogram();  // display graph histogram
+    RunTest(csr, args, *context);  // run sample test
+
+    return 0;
+}

From b61303a6aa5227be39707a5e214442c9d37db3e7 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Fri, 19 Jun 2015 11:35:23 -0700
Subject: [PATCH 18/36] cache binary csr, way much faster for testing graph i/o

---
 gunrock/coo.cuh            |  9 ++--
 gunrock/csr.cuh            | 86 ++++++++++++++------------------------
 gunrock/graphio/market.cuh | 27 ++++++------
 3 files changed, 46 insertions(+), 76 deletions(-)

diff --git a/gunrock/coo.cuh b/gunrock/coo.cuh
index e6b585a4c..008dec22b 100644
--- a/gunrock/coo.cuh
+++ b/gunrock/coo.cuh
@@ -37,8 +37,7 @@ struct Coo {
 
     Coo(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {}
 
-    void Val(Value &value)
-    {
+    void Val(Value &value) {
         value = val;
     }
 };
@@ -71,8 +70,7 @@ struct Coo<VertexId, util::NullType> {
 template<typename Coo>
 bool RowFirstTupleCompare (
     Coo elem1,
-    Coo elem2)
-{
+    Coo elem2) {
     if (elem1.row < elem2.row) {
         // Sort edges by source node
         return true;
@@ -97,8 +95,7 @@ bool RowFirstTupleCompare (
 template<typename Coo>
 bool ColumnFirstTupleCompare (
     Coo elem1,
-    Coo elem2)
-{
+    Coo elem2) {
     if (elem1.col < elem2.col) {
         // Sort edges by source node
         return true;
diff --git a/gunrock/csr.cuh b/gunrock/csr.cuh
index d4d5da2ff..0c2f46373 100644
--- a/gunrock/csr.cuh
+++ b/gunrock/csr.cuh
@@ -129,30 +129,22 @@ struct Csr {
 
     /**
      *
-     * @brief Store graph information into files
+     * @brief Store graph information into a file
      *
      */
-    void WriteToFile(
-        char     *file_name,
-        bool     undirected,
-        bool     reversed,
-        SizeT    num_nodes,
-        SizeT    num_edges,
-        SizeT    *row_offsets,
-        VertexId *col_indices,
-        Value    *edge_values = NULL) {
-        std::ofstream output(file_name);
-        if (output.is_open()) {
-            output << num_nodes << " " << num_edges << " ";
-            std::copy(row_offsets, row_offsets + num_nodes + 1,
-                      std::ostream_iterator<VertexId>(output, " "));
-            std::copy(column_indices, column_indices + num_edges,
-                      std::ostream_iterator<VertexId>(output, " "));
+    void WriteToFile(char  *file_name, SizeT v, SizeT e, SizeT *row,
+                     VertexId *col, Value *edge_values = NULL) {
+        std::ofstream fout(file_name, std::ios::binary);  // raw CSR bytes
+        if (fout.is_open()) {
+            fout.write(reinterpret_cast<const char*>(&v), sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(&e), sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(row), (v+1)*sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(col), e*sizeof(VertexId));
             if (edge_values != NULL) {
-                std::copy(edge_values, edge_values + num_edges,
-                          std::ostream_iterator<Value>(output, " "));
+                fout.write(reinterpret_cast<const char*>(edge_values),
+                           e * sizeof(Value));
             }
-            output.close();
+            fout.close();
         }
     }
 
@@ -162,40 +154,26 @@ struct Csr {
      *
      */
     template <bool LOAD_EDGE_VALUES>
-    void FromCsr(char *f_in, bool undirected, bool reversed) {
-        printf("  Reading directly from previously stored CSR arrays ...\n");
-
-        std::ifstream _file(f_in);
-
-        if (_file.is_open()) {
-            time_t mark1 = time(NULL);
-
-            std::istream_iterator<Value> start(_file), end;
-            std::vector<Value> v(start, end);
-
-            SizeT csr_nodes = v[0];
-            SizeT csr_edges = v[1];
-
+    void FromCsr(char *f_in) {
+        printf("  Reading directly from stored binary CSR arrays ...\n");
+        time_t mark1 = time(NULL);
 
-            FromScratch<LOAD_EDGE_VALUES, false>(csr_nodes, csr_edges);
+        std::ifstream input(f_in, std::ios::binary);  // matches WriteToFile
+        SizeT v, e;
+        input.read(reinterpret_cast<char*>(&v), sizeof(SizeT));
+        input.read(reinterpret_cast<char*>(&e), sizeof(SizeT));
 
-            std::copy(v.begin() + 2, v.begin() + 3 + csr_nodes, row_offsets);
-            std::copy(v.begin() + 3 + csr_nodes,
-                      v.begin() + 3 + csr_nodes + csr_edges,
-                      column_indices);
-            if (LOAD_EDGE_VALUES) {
-                std::copy(v.begin() + 3 + csr_nodes + csr_edges,
-                          v.end(), edge_values);
-            }
+        FromScratch<LOAD_EDGE_VALUES, false>(v, e);
 
-            time_t mark2 = time(NULL);
-            printf("Done reading (%ds).\n", (int) (mark2 - mark1));
-
-            v.clear();
-        } else {
-            perror("Unable To Open The File.");
+        input.read(reinterpret_cast<char*>(row_offsets), (v + 1)*sizeof(SizeT));
+        input.read(reinterpret_cast<char*>(column_indices), e*sizeof(VertexId));
+        if (LOAD_EDGE_VALUES) {
+            input.read(reinterpret_cast<char*>(edge_values), e*sizeof(Value));
         }
 
+        time_t mark2 = time(NULL);
+        printf("Done reading (%ds).\n", (int) (mark2 - mark1));
+
         // compute out_nodes
         SizeT out_node = 0;
         for (SizeT node = 0; node < nodes; node++) {
@@ -286,15 +264,14 @@ struct Csr {
 
         // Write offsets, indices, node, edges etc. into file
         if (LOAD_EDGE_VALUES) {
-            WriteToFile(output_file, undirected, reversed, nodes, edges,
+            WriteToFile(output_file, nodes, edges,
                         row_offsets, column_indices, edge_values);
         } else {
-            WriteToFile(output_file, undirected, reversed, nodes, edges,
+            WriteToFile(output_file, nodes, edges,
                         row_offsets, column_indices);
         }
 
         if (new_coo) free(new_coo);
-        fflush(stdout);
 
         // Compute out_nodes
         SizeT out_node = 0;
@@ -342,7 +319,6 @@ struct Csr {
         }
         printf("\nDegree Histogram (%lld vertices, %lld edges):\n",
                (long long) nodes, (long long) edges);
-
         printf("    Degree   0: %d (%.2f%%)\n", log_counts[0],
                (float) log_counts[0] * 100.0 / nodes);
         for (int i = 0; i < max_log_length + 1; i++) {
@@ -369,7 +345,7 @@ struct Csr {
                     edge++) {
                 printf("[");
                 util::PrintValue(column_indices[edge]);
-                if (with_edge_value) {
+                if (with_edge_value && edge_values != NULL) {
                     printf(",");
                     util::PrintValue(edge_values[edge]);
                 }
@@ -438,7 +414,7 @@ struct Csr {
             double mean = 0, count = 0;
             for (SizeT node = 0; node < nodes; ++node) {
                 count += 1;
-                mean += (row_offsets[node + 1] - row_offsets[node] - mean) / count;
+                mean += (row_offsets[node+1]-row_offsets[node]-mean)/count;
             }
             average_degree = static_cast<SizeT>(mean);
         }
diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh
index 74284f065..2fd7e92a1 100644
--- a/gunrock/graphio/market.cuh
+++ b/gunrock/graphio/market.cuh
@@ -215,12 +215,9 @@ int ReadMarketStream(
  *
  */
 template <bool LOAD_VALUES, typename VertexId, typename Value, typename SizeT>
-int ReadCsrArrays(
-    char *f_in,
-    Csr<VertexId, Value, SizeT> &csr_graph,
-    bool undirected,
-    bool reversed) {
-    csr_graph.template FromCsr<LOAD_VALUES>(f_in, undirected, reversed);
+int ReadCsrArrays(char *f_in, Csr<VertexId, Value, SizeT> &csr_graph,
+                  bool undirected, bool reversed) {
+    csr_graph.template FromCsr<LOAD_VALUES>(f_in);
     return 0;
 }
 
@@ -301,19 +298,19 @@ int BuildMarketGraph(
     char *file_name = basename(temp2);
 
     if (undirected) {
-        char ud[256];
-        sprintf(ud, "%s/.%s_undirected_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, ud, graph, true, false) != 0)
+        char ud[256];  // undirected graph
+        sprintf(ud, "%s/.%s.ud.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, ud, graph, true, false) != 0)
             return 1;
     } else if (!undirected && reversed) {
-        char rv[256];
-        sprintf(rv, "%s/.%s_reversed_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, rv, graph, false, true) != 0)
+        char rv[256];  // reversed graph
+        sprintf(rv, "%s/.%s.rv.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, rv, graph, false, true) != 0)
             return 1;
     } else if (!undirected && !reversed) {
-        char nr[256];
-        sprintf(nr, "%s/.%s_nonreversed_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, nr, graph, false, false) != 0)
+        char di[256];  // directed graph
+        sprintf(di, "%s/.%s.di.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, di, graph, false, false) != 0)
             return 1;
     } else {
         fprintf(stderr, "Unspecified Graph Type.\n");

From b51ee68af6ecefb060bda96a1357aad64fa9fff5 Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yzhwang@ucdavis.edu>
Date: Fri, 19 Jun 2015 14:17:45 -0700
Subject: [PATCH 19/36] Almost fixed the BC bug. Still occasionally get early
 quit in the advance kernel. Need to figure out why.

---
 gunrock/app/bc/bc_enactor.cuh                 | 90 +------------------
 gunrock/app/bc/bc_functor.cuh                 |  1 -
 gunrock/oprtr/edge_map_partitioned/kernel.cuh | 23 ++---
 gunrock/util/test_utils.cuh                   |  5 +-
 tests/bc/test_bc.cu                           |  8 +-
 5 files changed, 19 insertions(+), 108 deletions(-)

diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh
index 24b4abe99..6cc6436c2 100644
--- a/gunrock/app/bc/bc_enactor.cuh
+++ b/gunrock/app/bc/bc_enactor.cuh
@@ -257,12 +257,9 @@ class BCEnactor : public EnactorBase
             //util::MemsetAddKernel<<<128, 128>>>(d_scanned_edges, (unsigned int)0, graph_slice->edges);
             // Forward BC iteration
             while (done[0] < 0) {
-
                 if (frontier_attribute.queue_length > 0 && enactor_stats.iteration > 0) {
                     SizeT cur_offset = forward_queue_offsets.top();
-                    //printf("offset:%d, current length:%d\n", cur_offset, frontier_attribute.queue_length);
                     util::MemsetCopyVectorKernel<<<128, 128>>>(&problem->data_slices[0]->d_forward_output[cur_offset], graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
-                    //util::DisplayDeviceResults(graph_slice->frontier_queues.d_keys[frontier_attribute.selector], frontier_attribute.queue_length);
                     forward_queue_offsets.push(frontier_attribute.queue_length+cur_offset);
                 }
 
@@ -374,93 +371,14 @@ class BCEnactor : public EnactorBase
                 if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration);
 
             }
-            //delete[] sigmas;
-            //delete[] labels;
-            //delete[] vids;
-            //util::DisplayDeviceResults(problem->data_slices[0]->d_forward_output, forward_queue_offsets.back());
-
-            /*enactor_stats.iteration                 = enactor_stats.iteration - 2;
-
-            frontier_attribute.queue_length        = graph_slice->nodes;
-            frontier_attribute.queue_index         = 0;        // Work queue index
-            frontier_attribute.selector            = 0;
             frontier_attribute.queue_reset         = true;
-            done[0]             = -1;
-
-            // Prepare the label array
-            VertexId            label_adjust = -enactor_stats.iteration;
-            util::MemsetAddKernel<<<128, 128>>>(problem->data_slices[0]->d_labels, label_adjust, graph_slice->nodes);*/
-
 
             if (DEBUG) printf("\nStart backward phase\n%lld", (long long) enactor_stats.iteration);
             // Backward BC iteration
             SizeT top_offset = forward_queue_offsets.top();
-            if (DEBUG) printf("top offsets:%d\n", top_offset);
             forward_queue_offsets.pop();
             while (!forward_queue_offsets.empty()) {
                 frontier_attribute.queue_length = top_offset-forward_queue_offsets.top();
-                util::DisplayDeviceResults(problem->data_slices[0]->d_sigmas, &problem->data_slices[0]->d_forward_output[forward_queue_offsets.top()], graph_slice->nodes, frontier_attribute.queue_length);
-                /*frontier_attribute.queue_length        = graph_slice->nodes;
-                // Fill in the frontier_queues
-                util::MemsetIdxKernel<<<128, 128>>>(graph_slice->frontier_queues.d_keys[0], graph_slice->nodes);
-
-                // Filter
-                gunrock::oprtr::filter::Kernel<FilterKernelPolicy, BCProblem, BackwardFunctor>
-                <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-                    -1,
-                    frontier_attribute.queue_reset,
-                    frontier_attribute.queue_index,
-                    enactor_stats.num_gpus,
-                    frontier_attribute.queue_length,
-                    d_done,
-                    graph_slice->frontier_queues.d_keys[0],      // d_in_queue
-                    NULL,
-                    graph_slice->frontier_queues.d_keys[1],    // d_out_queue
-                    data_slice,
-                    NULL,
-                    work_progress,
-                    graph_slice->nodes,           // max_in_queue
-                    graph_slice->edges,         // max_out_queue
-                    enactor_stats.filter_kernel_stats);*/
-
-
-                // Only need to reset queue for once
-                /*if (frontier_attribute.queue_reset)
-                    frontier_attribute.queue_reset = false; */
-
-                //if (/*DEBUG &&*/ (retval = util::GRError(cudaThreadSynchronize(), "edge_map_backward::Kernel failed", __FILE__, __LINE__))) break;
-                /*cudaEventQuery(throttle_event);                                 // give host memory mapped visibility to GPU updates
-
-                frontier_attribute.queue_index++;
-                frontier_attribute.selector ^= 1;
-
-                if (AdvanceKernelPolicy::ADVANCE_MODE == gunrock::oprtr::advance::LB) {
-                    if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
-                    }
-
-                if (DEBUG) {
-                    if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
-                    printf(", %lld", (long long) frontier_attribute.queue_length);
-                }
-
-                if (INSTRUMENT) {
-                    if (retval = enactor_stats.advance_kernel_stats.Accumulate(
-                        enactor_stats.advance_grid_size,
-                        enactor_stats.total_runtimes,
-                        enactor_stats.total_lifetimes)) break;
-                }
-
-                // Throttle
-                if (enactor_stats.iteration & 1) {
-                    if (retval = util::GRError(cudaEventRecord(throttle_event),
-                        "BCEnactor cudaEventRecord throttle_event failed", __FILE__, __LINE__)) break;
-                } else {
-                    if (retval = util::GRError(cudaEventSynchronize(throttle_event),
-                        "BCEnactor cudaEventSynchronize throttle_event failed", __FILE__, __LINE__)) break;
-                }
-
-                // Check if done
-                if (done[0] == 0) break;*/
                 // Edge Map
                 if (forward_queue_offsets.top() > 0) {
                     gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, BCProblem, BackwardFunctor>(
@@ -513,11 +431,6 @@ class BCEnactor : public EnactorBase
                 if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
                 cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates
 
-                //frontier_attribute.queue_index++;
-                //frontier_attribute.selector ^= 1;
-
-                //util::MemsetAddKernel<<<128, 128>>>(problem->data_slices[0]->d_labels, 1, graph_slice->nodes);
-
                 if (INSTRUMENT || DEBUG) {
                     if (retval = work_progress.GetQueueLength(frontier_attribute.queue_index, frontier_attribute.queue_length)) break;
                     if (INSTRUMENT) {
@@ -529,7 +442,6 @@ class BCEnactor : public EnactorBase
                 }
                 top_offset = forward_queue_offsets.top();
                 forward_queue_offsets.pop();
-                if (DEBUG) printf("top offsets:%d\n", top_offset);
             }
             if (retval) break;
 
@@ -593,7 +505,7 @@ class BCEnactor : public EnactorBase
                 BCProblem,                         // Problem data type
                 300,                                // CUDA_ARCH
                 INSTRUMENT,                         // INSTRUMENT
-                8,                                  // MIN_CTA_OCCUPANCY
+                1,                                  // MIN_CTA_OCCUPANCY
                 10,                                  // LOG_THREADS
                 8,                                  // LOG_BLOCKS
                 32*128,                             // LIGHT_EDGE_THRESHOLD (used for partitioned advance mode)
diff --git a/gunrock/app/bc/bc_functor.cuh b/gunrock/app/bc/bc_functor.cuh
index 6b8e594d6..2e8ed3fbc 100644
--- a/gunrock/app/bc/bc_functor.cuh
+++ b/gunrock/app/bc/bc_functor.cuh
@@ -162,7 +162,6 @@ struct BackwardFunctor
      */
     static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
     {
-
         VertexId s_label;
         VertexId d_label;
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
diff --git a/gunrock/oprtr/edge_map_partitioned/kernel.cuh b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
index 55e2a53b4..ba7653589 100644
--- a/gunrock/oprtr/edge_map_partitioned/kernel.cuh
+++ b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
@@ -272,15 +272,12 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
 
                                     my_thread_start = bid * partition_size;
                                     my_thread_end = (bid+1)*partition_size < output_queue_len ? (bid+1)*partition_size : output_queue_len;
-                                    //printf("tid:%d, bid:%d, m_thread_start:%d, m_thread_end:%d\n",tid, bid, my_thread_start, my_thread_end); 
 
                                     if (my_thread_start >= output_queue_len)
                                         return;
 
                                     int my_start_partition = partition_starts[bid];
                                     int my_end_partition = partition_starts[bid+1] > input_queue_len ? partition_starts[bid+1] : input_queue_len;
-                                    //if (tid == 0 && bid == 252)
-                                    //    printf("bid(%d) < num_partitions-1(%d)?, partition_starts[bid+1]+1:%d\n", bid, num_partitions-1, partition_starts[bid+1]+1);
 
                                     __shared__ typename KernelPolicy::SmemStorage smem_storage;
                                     // smem_storage.s_edges[NT]
@@ -302,8 +299,6 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
                                         __syncthreads();
 
                                         s_edges[tid] = (my_start_partition + tid < my_end_partition ? d_scanned_edges[my_start_partition + tid] - pre_offset : max_edges);
-                                        //if (bid == 252 && tid == 2)
-                                        //    printf("start_partition+tid:%d < my_end_partition:%d ?, d_queue[%d]:%d\n", my_start_partition+tid, my_end_partition, my_start_partition+tid, d_queue[my_start_partition+tid]);
                                         if (ADVANCE_TYPE == gunrock::oprtr::advance::V2V || ADVANCE_TYPE == gunrock::oprtr::advance::V2E) {
                                             s_vertices[tid] = my_start_partition + tid < my_end_partition ? d_queue[my_start_partition+tid] : -1;
                                             s_edge_ids[tid] = 0;
@@ -553,12 +548,12 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
         }
 
         // Determine work decomposition
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
+        if (blockIdx.x == 0 && threadIdx.x == 0) { 
 
             // obtain problem size
             if (queue_reset)
             {
-                work_progress.StoreQueueLength<SizeT>(input_queue_len, queue_index);
+                //work_progress.StoreQueueLength<SizeT>(input_queue_len, queue_index);
             }
             else
             {
@@ -568,17 +563,17 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
                 if (input_queue_len == 0) {
                     if (d_done) d_done[0] = input_queue_len;
                 }
-            }
+            } 
 
             work_progress.Enqueue(output_queue_len, queue_index+1);
 
             // Reset our next outgoing queue counter to zero
             work_progress.template StoreQueueLength<SizeT>(0, queue_index + 2);
             work_progress.template PrepResetSteal<SizeT>(queue_index + 1);
-        }
+        } 
 
         // Barrier to protect work decomposition
-        __syncthreads();
+        __syncthreads(); 
 
         unsigned int range = input_queue_len;
         int tid = threadIdx.x;
@@ -606,16 +601,16 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
             else
                 s_vertices[tid] = (my_id < range ? d_column_indices[d_queue[my_id]] : max_vertices);
             s_edge_ids[tid] = (my_id < range ? d_queue[my_id] : max_vertices);
-        }
+        } 
 
         __syncthreads();
-        unsigned int size = s_edges[end_id];
+        unsigned int size = s_edges[end_id]; 
 
         VertexId v, e, e_id;
         int v_index = BinarySearch<KernelPolicy::THREADS>(tid, s_edges);
         v = s_vertices[v_index];
         e_id = s_edge_ids[v_index];
-        int end_last = (v_index < KernelPolicy::THREADS ? s_edges[v_index] : max_vertices);
+        int end_last = (v_index < KernelPolicy::THREADS ? s_edges[v_index] : max_vertices); 
 
         for (int i = tid; i < size; i += KernelPolicy::THREADS)
         {
@@ -713,7 +708,7 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
                         }
                     }
                 }
-            } else {
+            } else { 
                 //v:pre, u:neighbor, outoffset:offset+i
                 if (Functor::CondEdge(v, u, problem, lookup, e_id)) {
                     Functor::ApplyEdge(v, u, problem, lookup, e_id);
diff --git a/gunrock/util/test_utils.cuh b/gunrock/util/test_utils.cuh
index 93f223642..11eff1ddb 100644
--- a/gunrock/util/test_utils.cuh
+++ b/gunrock/util/test_utils.cuh
@@ -446,7 +446,8 @@ int CompareResults(
                 is_right = false;
             }
         }
-        if (!is_right && flag == 0)
+        
+        if (!is_right)
         {
             printf("\nINCORRECT: [%lu]: ", (unsigned long) i);
             PrintValue<float>(computed[i]);
@@ -473,7 +474,7 @@ int CompareResults(
             flag += 1;
             //return flag;
         }
-        if (!is_right && flag > 0) flag += 1;
+        //if (!is_right && flag > 0) flag += 1;
     }
     printf("\n");
     if (!flag)
diff --git a/tests/bc/test_bc.cu b/tests/bc/test_bc.cu
index 09a3d1f2d..a3d2f5514 100644
--- a/tests/bc/test_bc.cu
+++ b/tests/bc/test_bc.cu
@@ -287,9 +287,12 @@ void RefCPUBC(
 
         for (int iter = search_depth - 2; iter > 0; --iter)
         {
+
+            int cur_level = 0;
             for (int node = 0; node < graph.nodes; ++node)
             {
                 if (source_path[node] == iter) {
+                    ++cur_level;
                     int edges_begin = graph.row_offsets[node];
                     int edges_end = graph.row_offsets[node+1];
 
@@ -306,9 +309,7 @@ void RefCPUBC(
         }
 
         for (int i = 0; i < graph.nodes; ++i)
-        {
             bc_values[i] *= 0.5f;
-        }
 
         cpu_timer.Stop();
         float elapsed = cpu_timer.ElapsedMillis();
@@ -433,6 +434,7 @@ void RunTests(
     gpu_timer.Start();
     for (VertexId i = start_src; i < end_src; ++i)
     {
+        printf("src:%d\n", i);
         util::GRError(csr_problem->Reset(i, bc_enactor.GetFrontierType(), max_queue_sizing), "BC Problem Data Reset Failed", __FILE__, __LINE__);
         util::GRError(bc_enactor.template Enact<Problem>(context, csr_problem, i, max_grid_size), "BC Problem Enact Failed", __FILE__, __LINE__);
     }
@@ -615,7 +617,9 @@ int main( int argc, char** argv)
 
         csr.PrintHistogram();
         //csr.DisplayGraph();
+        csr.DisplayNeighborList(1263);
         fflush(stdout);
+        printf("1263 row offsets:%d\n", csr.row_offsets[1263]);
 
         // Run tests
         RunTests(csr, args, *context);

From 0f0e19c0cb481e7de51acdb292d1ecaa6c180b5c Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Fri, 19 Jun 2015 17:53:31 -0700
Subject: [PATCH 20/36] go back to prev timer which is correct on large datasets

---
 gunrock/util/test_utils.h | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/gunrock/util/test_utils.h b/gunrock/util/test_utils.h
index 8c5e9d573..c2433c00c 100644
--- a/gunrock/util/test_utils.h
+++ b/gunrock/util/test_utils.h
@@ -19,7 +19,6 @@
     #undef small            // Windows is terrible for polluting macro namespace
 #else
     #include <sys/resource.h>
-    #include <time.h>
 #endif
 
 #include <stdio.h>
@@ -217,34 +216,6 @@ struct CpuTimer
         return (stop - start) * 1000;
     }
 
-#elif defined(CLOCK_PROCESS_CPUTIME_ID)
-
-    timespec start;
-    timespec stop;
-
-    void Start()
-    {
-        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
-    }
-
-    void Stop()
-    {
-        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
-    }
-
-    float ElapsedMillis()
-    {
-        timespec temp;
-        if ((stop.tv_nsec-start.tv_nsec)<0) {
-            temp.tv_sec = stop.tv_sec-start.tv_sec-1;
-            temp.tv_nsec = 1000000000+stop.tv_nsec-start.tv_nsec;
-        } else {
-            temp.tv_sec = stop.tv_sec-start.tv_sec;
-            temp.tv_nsec = stop.tv_nsec-start.tv_nsec;
-        }
-        return temp.tv_nsec/1000000.0;
-    }
-
 #else
 
     rusage start;

From 767ca47649a8db6f9b05185689ba71d1f1ee9085 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Sat, 20 Jun 2015 09:02:49 -0700
Subject: [PATCH 21/36] oops, wrong value type for cpu validation code

---
 tests/mst/test_mst.cu   | 10 +++++-----
 tests/sssp/test_sssp.cu |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index d74e89233..b563517ac 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -169,7 +169,7 @@ Value SimpleReferenceMST(
   // Kruskal minimum spanning tree preparations
   using namespace boost;
   typedef adjacency_list< vecS, vecS, undirectedS,
-    no_property, property<edge_weight_t, float> >   Graph;
+    no_property, property<edge_weight_t, int> >   Graph;
   typedef graph_traits < Graph >::edge_descriptor   Edge;
   typedef graph_traits < Graph >::vertex_descriptor Vertex;
   typedef std::pair<VertexId, VertexId> E;
@@ -312,14 +312,14 @@ void RunTests(
       // print the edge pairs in the minimum spanning tree
       DisplaySolution(graph, h_mst_output);
       printf("\nCORRECT.\n");
-      std::cout << "CPU Computed Total Weight = " << total_weight_cpu << std::endl;
-      std::cout << "GPU Computed Total Weight = " << total_weight_gpu << std::endl;
+      std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl;
     }
     else
     {
       printf("INCORRECT.\n");
-      std::cout << "CPU Computed Total Weight = " << total_weight_cpu << std::endl;
-      std::cout << "GPU Computed Total Weight = " << total_weight_gpu << std::endl;
+      std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl;
     }
   }
 
diff --git a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu
index b084842b6..2bc204495 100644
--- a/tests/sssp/test_sssp.cu
+++ b/tests/sssp/test_sssp.cu
@@ -247,7 +247,7 @@ void SimpleReferenceSssp(
 
     // Prepare Boost Datatype and Data structure
     typedef adjacency_list<vecS, vecS, directedS, no_property,
-                           property <edge_weight_t, float> > Graph;
+                           property <edge_weight_t, unsigned int> > Graph;
 
     typedef graph_traits<Graph>::vertex_descriptor vertex_descriptor;
     typedef graph_traits<Graph>::edge_descriptor edge_descriptor;

From 29ce8a000ce8920583e2ce694555435994aad5ce Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yzhwang@ucdavis.edu>
Date: Mon, 22 Jun 2015 08:34:37 -0700
Subject: [PATCH 22/36] BC works correctly now. disabled edge bc for now
 though.

---
 gunrock/oprtr/edge_map_partitioned/kernel.cuh | 2 +-
 tests/bc/test_bc.cu                           | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/gunrock/oprtr/edge_map_partitioned/kernel.cuh b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
index ba7653589..bce9a13b2 100644
--- a/gunrock/oprtr/edge_map_partitioned/kernel.cuh
+++ b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
@@ -553,7 +553,7 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
             // obtain problem size
             if (queue_reset)
             {
-                //work_progress.StoreQueueLength<SizeT>(input_queue_len, queue_index);
+                work_progress.StoreQueueLength<SizeT>(input_queue_len, queue_index);
             }
             else
             {
diff --git a/tests/bc/test_bc.cu b/tests/bc/test_bc.cu
index a3d2f5514..1eb3e6c20 100644
--- a/tests/bc/test_bc.cu
+++ b/tests/bc/test_bc.cu
@@ -225,7 +225,7 @@ void RefCPUBC(
 
         for (idx = 0; idx < graph.edges; ++idx) {
             //std::cout << coo[idx].row << "," << coo[idx].col << ":" << coo[idx].val << std::endl;
-            ebc_values[idx] = coo[idx].val;
+            //ebc_values[idx] = coo[idx].val;
         }
 
         printf("CPU BC finished in %lf msec.", elapsed);
@@ -369,7 +369,7 @@ void RunTests(
     Value *h_bc_values         = (Value*)malloc(sizeof(Value) * graph.nodes);
     Value *h_ebc_values         = (Value*)malloc(sizeof(Value) * graph.edges);
     Value *reference_check_bc_values = (g_quick) ? NULL : reference_bc_values;
-    Value *reference_check_ebc_values = (g_quick || (src != -1)) ? NULL : reference_ebc_values;
+    Value *reference_check_ebc_values = NULL;//(g_quick || (src != -1)) ? NULL : reference_ebc_values;
     Value *reference_check_sigmas = (g_quick || (src == -1)) ? NULL : reference_sigmas;
 
     // Allocate BC enactor map
@@ -434,7 +434,6 @@ void RunTests(
     gpu_timer.Start();
     for (VertexId i = start_src; i < end_src; ++i)
     {
-        printf("src:%d\n", i);
         util::GRError(csr_problem->Reset(i, bc_enactor.GetFrontierType(), max_queue_sizing), "BC Problem Data Reset Failed", __FILE__, __LINE__);
         util::GRError(bc_enactor.template Enact<Problem>(context, csr_problem, i, max_grid_size), "BC Problem Enact Failed", __FILE__, __LINE__);
     }
@@ -617,9 +616,6 @@ int main( int argc, char** argv)
 
         csr.PrintHistogram();
         //csr.DisplayGraph();
-        csr.DisplayNeighborList(1263);
-        fflush(stdout);
-        printf("1263 row offsets:%d\n", csr.row_offsets[1263]);
 
         // Run tests
         RunTests(csr, args, *context);

From 64a12ba2941a3c75978408b352f95e89f59501bb Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Mon, 22 Jun 2015 08:43:13 -0700
Subject: [PATCH 23/36] fix warning

---
 gunrock/graphio/market.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh
index 2fd7e92a1..12c9a3235 100644
--- a/gunrock/graphio/market.cuh
+++ b/gunrock/graphio/market.cuh
@@ -129,12 +129,12 @@ int ReadMarketStream(
                 return -1;
             }
 
-            long long ll_row, ll_col;
-            Value ll_value;
+            long long ll_row, ll_col, ll_value;
+            // Value ll_value;  // used for parse float / double
             int num_input;
             if (LOAD_VALUES) {
                 if ((num_input = sscanf(
-                                     line, "%lld %lld %d",
+                                     line, "%lld %lld %lld",
                                      &ll_col, &ll_row, &ll_value)) < 2) {
                     fprintf(stderr,
                             "Error parsing MARKET graph: badly formed edge\n");

From fd3f5b9e51aeea82aa1947e540a3c7b0cd756950 Mon Sep 17 00:00:00 2001
From: wyd855 <wyd855@gmail.com>
Date: Mon, 22 Jun 2015 10:24:46 -0700
Subject: [PATCH 24/36] fix a warning

---
 gunrock/app/pr/pr_enactor.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gunrock/app/pr/pr_enactor.cuh b/gunrock/app/pr/pr_enactor.cuh
index 6217704c6..1ed922bc1 100644
--- a/gunrock/app/pr/pr_enactor.cuh
+++ b/gunrock/app/pr/pr_enactor.cuh
@@ -243,7 +243,7 @@ public:
             cudaEventCreate(&start);
             cudaEventCreate(&stop);
             cudaEventRecord(start, 0);
-            SizeT frontier_attribute_queue_length = graph_slice->nodes;
+            long long frontier_attribute_queue_length = graph_slice->nodes;
 
             // Step through PageRank iterations
             while (done[0] < 0) {

From 260b3e5829fb5778470116d9a88558489b0540a9 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Mon, 22 Jun 2015 12:19:20 -0700
Subject: [PATCH 25/36] adding new primitive into shared library

---
 gunrock/CMakeLists.txt          |   1 +
 gunrock/app/mst/mst_app.cu      | 166 ++++++++++++++++++++++++++++++++
 gunrock/gunrock.h               |   7 ++
 shared_lib_tests/CMakeLists.txt |  17 ++--
 shared_lib_tests/test_mst.c     |  62 ++++++++++++
 5 files changed, 246 insertions(+), 7 deletions(-)
 create mode 100644 gunrock/app/mst/mst_app.cu
 create mode 100644 shared_lib_tests/test_mst.c

diff --git a/gunrock/CMakeLists.txt b/gunrock/CMakeLists.txt
index 625225964..0f1d8d6ef 100644
--- a/gunrock/CMakeLists.txt
+++ b/gunrock/CMakeLists.txt
@@ -26,6 +26,7 @@ set(CUFILES
   app/cc/cc_app.cu
   app/sssp/sssp_app.cu
   app/pr/pr_app.cu
+  app/mst/mst_app.cu
   util/test_utils.cu
   util/error_utils.cu
   ${mgpu_SOURCE_FILES})
diff --git a/gunrock/app/mst/mst_app.cu b/gunrock/app/mst/mst_app.cu
new file mode 100644
index 000000000..283345aaa
--- /dev/null
+++ b/gunrock/app/mst/mst_app.cu
@@ -0,0 +1,166 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- Fast and Efficient GPU Graph Library
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file mst_app.cu
+ *
+ * @brief minimum spanning tree (MST) problem implementation
+ */
+
+#include <stdio.h>
+#include <gunrock/gunrock.h>
+
+// Graph construction utils
+#include <gunrock/graphio/market.cuh>
+
+// Primitive-specific includes
+#include <gunrock/app/mst/mst_enactor.cuh>
+#include <gunrock/app/mst/mst_problem.cuh>
+#include <gunrock/app/mst/mst_functor.cuh>
+
+// ModernGPU include
+#include <moderngpu.cuh>
+
+using namespace gunrock;
+using namespace gunrock::util;
+using namespace gunrock::oprtr;
+using namespace gunrock::app::mst;
+
+/**
+ * @brief run minimum spanning tree
+ *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[out] graph_o GunrockGraph type output graph
+ * @param[in]  csr Reference to the CSR graph we process on
+ * @param[in]  max_grid_size Maximum CTA occupancy
+ * @param[in]  num_gpus Number of GPUs
+ * @param[in]  context moderngpu context
+ */
+template<typename VertexId, typename Value, typename SizeT>
+void run_mst(
+    GunrockGraph   *graph_o,
+    const Csr<VertexId, Value, SizeT> &csr,
+    const int      max_grid_size,
+    const int      num_gpus,
+    CudaContext    &context) {
+    typedef MSTProblem<VertexId, SizeT, Value, true> Problem;  // preperations
+    MSTEnactor<false> enactor(false);                          // enactor map
+    VertexId  *h_mst = new VertexId[csr.edges];                // host array
+    Problem *problem = new Problem;                            // problem on GPU
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "MST Problem Data Initialization Failed", __FILE__, __LINE__);
+
+    util::GRError(problem->Reset(enactor.GetFrontierType()),
+                  "MST Problem Data Reset Failed", __FILE__, __LINE__);
+
+    CpuTimer gpu_timer;
+
+    gpu_timer.Start();
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, max_grid_size),
+                  "MST Problem Enact Failed", __FILE__, __LINE__);
+    gpu_timer.Stop();
+    float elapsed = gpu_timer.ElapsedMillis();
+
+    util::GRError(problem->Extract(h_mst),
+                  "MST Problem Data Extraction Failed", __FILE__, __LINE__);
+
+    // output mst results: 0 | 1 mask for all edges
+    graph_o->edge_values = (int*)&h_mst[0];
+
+    if (problem) { delete problem; }
+
+    cudaDeviceSynchronize();
+}
+
+/**
+ * @brief dispatch function to handle data types
+ *
+ * @param[out] graph_o  GunrockGraph type output graph
+ * @param[in]  graph_i  GunrockGraph type input graph
+ * @param[in]  configs  MST-specific configurations
+ * @param[in]  datatype data type configurations
+ * @param[in]  context  moderngpu context parameter
+ */
+void dispatch_mst(
+    GunrockGraph          *graph_o,
+    const GunrockGraph    *graph_i,
+    const GunrockConfig   configs,
+    const GunrockDataType datatype,
+    CudaContext           &context) {
+    switch (datatype.VTXID_TYPE) {
+    case VTXID_INT: {
+        switch (datatype.SIZET_TYPE) {
+        case SIZET_INT: {
+            switch (datatype.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
+                // create a CSR formatted graph
+                Csr<int, int, int> csr(false);
+                csr.nodes = graph_i->num_nodes;
+                csr.edges = graph_i->num_edges;
+                csr.row_offsets    = (int*)graph_i->row_offsets;
+                csr.column_indices = (int*)graph_i->col_indices;
+                csr.edge_values    = (int*)graph_i->edge_values;
+                // configurations if necessary
+                int num_gpus      = 1;  // number of GPU(s) to use
+                int max_grid_size = 0;  // leave it up tp the enactor
+                run_mst<int, int, int>(
+                    graph_o, csr, max_grid_size, num_gpus, context);
+                // reset for free memory
+                csr.row_offsets = NULL;
+                csr.column_indices = NULL;
+                csr.edge_values = NULL;
+                break;
+            }
+            case VALUE_UINT: {  // template type = <int, uint, int>
+                printf("Not Yet Support This DataType Combination.\n");
+                break;
+            }
+            case VALUE_FLOAT: {  // template type = <int, float, int>
+                printf("Not Yet Support This DataType Combination.\n");
+                break;
+            }
+            }
+            break;
+        }
+        }
+        break;
+    }
+    }
+}
+
+/**
+ * @brief run_mst entry
+ *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[out] graph_o  GunrockGraph type output graph
+ * @param[in]  graph_i  GunrockGraph type input graph
+ * @param[in]  configs  Gunrock primitive-specific configurations
+ * @param[in]  datatype data type configurations
+ */
+void gunrock_mst(
+    GunrockGraph          *graph_o,
+    const GunrockGraph    *graph_i,
+    const GunrockConfig    configs,
+    const GunrockDataType  datatype) {
+    int device = 0;  // default use GPU 0
+    device = configs.device;
+    ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_mst(graph_o, graph_i, configs, datatype, *context);
+}
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index ee695951f..96f1ddf41 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -143,6 +143,13 @@ void gunrock_topk_func(
     struct GunrockConfig      configs,
     struct GunrockDataType    data_type);
 
+// Minimum spanning tree
+void gunrock_mst(
+    struct GunrockGraph       *graph_out,
+    const struct GunrockGraph *graph_in,
+    struct GunrockConfig      configs,
+    struct GunrockDataType    data_type);
+
 // TODO: Add other algorithms
 
 #ifdef __cplusplus
diff --git a/shared_lib_tests/CMakeLists.txt b/shared_lib_tests/CMakeLists.txt
index 3d3f638db..96d2ee37a 100644
--- a/shared_lib_tests/CMakeLists.txt
+++ b/shared_lib_tests/CMakeLists.txt
@@ -1,20 +1,23 @@
 # gunrock test rig cmake file
 # include_directories(${gunrock_INCLUDE_DIRS}/gunrock)
 
-add_executable (test_topk test_topk.c)
+add_executable(test_topk test_topk.c)
 target_link_libraries(test_topk gunrock)
 
-add_executable (test_bfs test_bfs.c)
+add_executable(test_bfs test_bfs.c)
 target_link_libraries(test_bfs gunrock)
 
-add_executable (test_bc test_bc.c)
+add_executable(test_bc test_bc.c)
 target_link_libraries(test_bc gunrock)
 
-add_executable (test_cc test_cc.c)
+add_executable(test_cc test_cc.c)
 target_link_libraries(test_cc gunrock)
 
-add_executable (test_sssp test_sssp.c)
+add_executable(test_sssp test_sssp.c)
 target_link_libraries(test_sssp gunrock)
 
-add_executable (test_pr test_pr.c)
-target_link_libraries(test_pr gunrock)
\ No newline at end of file
+add_executable(test_pr test_pr.c)
+target_link_libraries(test_pr gunrock)
+
+add_executable(test_mst test_mst.c)
+target_link_libraries(test_mst gunrock)
\ No newline at end of file
diff --git a/shared_lib_tests/test_mst.c b/shared_lib_tests/test_mst.c
new file mode 100644
index 000000000..47592a206
--- /dev/null
+++ b/shared_lib_tests/test_mst.c
@@ -0,0 +1,62 @@
+/**
+ * @brief MST test for shared library
+ * @file test_mst.c
+ *
+ * set input graph, configs and call function gunrock_mst
+ * return per node or per edge values in graph_out node_values
+ */
+
+#include <stdio.h>
+#include <gunrock/gunrock.h>
+
+int main(int argc, char* argv[])
+{
+  // set problem data types
+  struct GunrockDataType dt;
+  dt.VTXID_TYPE = VTXID_INT;
+  dt.SIZET_TYPE = SIZET_INT;
+  dt.VALUE_TYPE = VALUE_INT;
+
+  // configurations (optional)
+  struct GunrockConfig configs;
+  configs.device = 0;
+
+  // tiny sample graph
+  size_t num_nodes = 7;
+  size_t num_edges = 26;
+  int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
+  int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+  int edge_values[26] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+  // build graph as input
+  struct GunrockGraph *graph_input =
+    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
+  graph_input->num_nodes   = num_nodes;
+  graph_input->num_edges   = num_edges;
+  graph_input->row_offsets = (void*)&row_offsets[0];
+  graph_input->col_indices = (void*)&col_indices[0];
+  graph_input->edge_values = (void*)&edge_values[0];
+
+  // malloc output graph
+  struct GunrockGraph *graph_output =
+    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
+
+  // call MST
+  gunrock_mst(graph_output, graph_input, configs, dt);
+
+  // demo test print
+  printf("Demo Outputs:\n");
+  int *mst_mask = (int*)malloc(sizeof(int) * num_edges);
+  mst_mask = (int*)graph_output->edge_values;
+  int edge;
+  for (edge = 0; edge < num_edges; ++edge) {
+    printf("Edge ID [%d] : Label [%d]\n", edge, mst_mask[edge]);
+  }
+
+  if (graph_input)  { free(graph_input);  }
+  if (graph_output) { free(graph_output); }
+
+  return 0;
+}

From 46db6fb5f255ee6764ae023e658ab43ceb20be97 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Tue, 23 Jun 2015 07:53:56 -0700
Subject: [PATCH 26/36] sssp add int support, mst add float support, refactor
 .so

---
 CMakeLists.txt                |   2 +-
 gunrock/app/bc/bc_app.cu      | 188 +++++++++---------------
 gunrock/app/bfs/bfs_app.cu    | 188 ++++++++++--------------
 gunrock/app/cc/cc_app.cu      | 142 +++++++-----------
 gunrock/app/mst/mst_app.cu    | 129 +++++++++--------
 gunrock/app/pr/pr_app.cu      | 211 +++++++++++----------------
 gunrock/app/pr/pr_enactor.cuh |   6 +-
 gunrock/app/sssp/sssp_app.cu  | 263 +++++++++++++++++++---------------
 gunrock/app/topk/topk_app.cu  | 149 +++++++------------
 gunrock/gunrock.h             | 178 +++++++++++------------
 shared_lib_tests/test_bc.c    | 107 +++++++-------
 shared_lib_tests/test_bfs.c   | 110 +++++++-------
 shared_lib_tests/test_cc.c    | 103 ++++++-------
 shared_lib_tests/test_mst.c   | 101 +++++++------
 shared_lib_tests/test_pr.c    | 118 +++++++--------
 shared_lib_tests/test_sssp.c  | 120 +++++++---------
 shared_lib_tests/test_topk.c  | 107 +++++++-------
 tests/hits/CMakeLists.txt     |   2 +-
 tests/mst/test_mst.cu         |   2 +-
 19 files changed, 998 insertions(+), 1228 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82a315c20..a58744b37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,7 +193,7 @@ add_test(NAME TestSSSP COMMAND test_sssp)
 set_tests_properties(TestSSSP
   PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*1.*: Label.*39.*: Predecessor.*0")
 
-add_test(NAME TestPR COMMAND test_pr --undirected)
+add_test(NAME TestPR COMMAND test_pr)
 set_tests_properties(TestPR
   PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*2.*: Page Rank.*0.357069.")
 
diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu
index fb70e9d11..af413d79b 100644
--- a/gunrock/app/bc/bc_app.cu
+++ b/gunrock/app/bc/bc_app.cu
@@ -8,16 +8,15 @@
 /**
  * @file bc_app.cu
  *
- * @brief Gunrock Betweeness Centrality Implementation
+ * @brief Gunrock betweeness centrality (BC) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// BC includes
+// betweeness centrality includes
 #include <gunrock/app/bc/bc_enactor.cuh>
 #include <gunrock/app/bc/bc_problem.cuh>
 #include <gunrock/app/bc/bc_functor.cuh>
@@ -36,150 +35,112 @@ using namespace gunrock::app::bc;
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to the output CSR graph object
- * @param[in] graph Reference to the CSR graph object defined in main driver
+ * @param[out] graph_o Pointer to the output CSR graph object
+ * @param[in] csr Reference to the CSR graph object defined in main driver
  * @param[in] source
  * @param[in] max_grid_size
  * @param[in] num_gpus
  * @param[in] max_queue_sizing
  * @param[in] context Reference to CudaContext used by moderngpu functions
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
+template<typename VertexId, typename Value, typename SizeT>
 void run_bc(
-    GunrockGraph *ggraph_out,
-    const Csr<VertexId, Value, SizeT> &graph,
-    VertexId source,
-    int      max_grid_size,
-    int      num_gpus,
-    double   max_queue_sizing,
-    CudaContext& context) {
-    typedef BCProblem <
-        VertexId,
-        SizeT,
-        Value,
-        true, // MARK_PREDECESSORS
-        false > Problem; //does not use double buffer
-
+    GRGraph        *graph_o,
+    const Csr<VertexId, Value, SizeT> &csr,
+    const VertexId source,
+    const int      max_grid_size,
+    const int      num_gpus,
+    const double   max_queue_sizing,
+    CudaContext    &context) {
+    typedef BCProblem<VertexId, SizeT, Value, true, false > Problem;
     // Allocate host-side array (for both reference and gpu-computed results)
-    Value *h_sigmas     = (Value*)malloc(sizeof(Value) * graph.nodes);
-    Value *h_bc_values  = (Value*)malloc(sizeof(Value) * graph.nodes);
-    Value *h_ebc_values = (Value*)malloc(sizeof(Value) * graph.edges);
-
-    // Allocate BC enactor map
-    BCEnactor<false> bc_enactor(false);
+    Value *h_sigmas     = (Value*)malloc(sizeof(Value) * csr.nodes);
+    Value *h_bc_values  = (Value*)malloc(sizeof(Value) * csr.nodes);
+    Value *h_ebc_values = (Value*)malloc(sizeof(Value) * csr.edges);
+    BCEnactor<false> enactor(false);  // Allocate BC enactor map
+    Problem *problem = new Problem;   // Allocate problem on GPU
 
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus),
+    util::GRError(problem->Init(false, csr, num_gpus),
                   "BC Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Perform BC
-    GpuTimer gpu_timer;
-
     VertexId start_source;
     VertexId end_source;
     if (source == -1) {
         start_source = 0;
-        end_source = graph.nodes;
+        end_source = csr.nodes;
     } else {
         start_source = source;
         end_source = source + 1;
     }
 
-    gpu_timer.Start();
     for (VertexId i = start_source; i < end_source; ++i) {
-        util::GRError(csr_problem->Reset(
-                          i, bc_enactor.GetFrontierType(), max_queue_sizing),
+        util::GRError(problem->Reset(
+                          i, enactor.GetFrontierType(), max_queue_sizing),
                       "BC Problem Data Reset Failed", __FILE__, __LINE__);
-        util::GRError(bc_enactor.template Enact<Problem>(
-                          context, csr_problem, i, max_grid_size),
+        util::GRError(enactor.template Enact<Problem>(
+                          context, problem, i, max_grid_size),
                       "BC Problem Enact Failed", __FILE__, __LINE__);
     }
 
     util::MemsetScaleKernel <<< 128, 128>>>(
-        csr_problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)graph.nodes);
-
-    gpu_timer.Stop();
-
-    float elapsed = gpu_timer.ElapsedMillis();
+        problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)csr.nodes);
 
-    //double avg_duty = 0.0;
-    //bc_enactor.GetStatistics(avg_duty);
-
-    // Copy out results to Host Device
-    util::GRError(csr_problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
+    util::GRError(problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
                   "BC Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // copy h_bc_values per node to GunrockGraph output
-    ggraph_out->node_values = (float*)&h_bc_values[0];
-    // copy h_ebc_values per edge to GunrockGraph output
-    ggraph_out->edge_values = (float*)&h_ebc_values[0];
-
-    printf("GPU Betweeness Centrality finished in %lf msec.\n", elapsed);
-
-    // Cleanup
-    if (csr_problem) delete csr_problem;
-    //if (h_sigmas) free(h_sigmas);
-    //if (h_bc_values) free(h_bc_values);
+    graph_o->node_values = (float*)&h_bc_values[0];   // h_bc_values per node 
+    graph_o->edge_values = (float*)&h_ebc_values[0];  // h_ebc_values per edge
 
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  bc_config  bc specific configurations
- * @param[in]  data_type  bc data_type configurations
- * @param[in]  context    moderngpu context
+ * @param[out] graph_o  GRGraph type output
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   Specific configurations
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  ModernGPU context
  */
 void dispatch_bc(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bc_config,
-    GunrockDataType    data_type,
-    CudaContext&       context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // not support yet
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // not support yet
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // build input csr format graph
                 Csr<int, float, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
                 // bc configurations
-                int   src_node         =  -1; //!< Use whatever the specified graph-type's default is
-                int   max_grid_size    =   0; //!< maximum grid size (0: leave it up to the enactor)
-                int   num_gpus         =   1; //!< Number of GPUs for multi-gpu enactor to use
-                float max_queue_sizing = 1.0; //!< Maximum size scaling factor for work queues
+                int   src_node         =  -1;  // default source vertex to start
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                int   num_gpus         =   1;  // Number of GPUs for multi-gpu
+                float max_queue_sizing = 1.0;  // Maximum size scaling factor
 
                 // determine source vertex to start bc
-                switch (bc_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -190,7 +151,7 @@ void dispatch_bc(
                     break;
                 }
                 case manually: {
-                    src_node = bc_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -198,11 +159,11 @@ void dispatch_bc(
                     break;
                 }
                 }
-                max_queue_sizing = bc_config.queue_size;
+                max_queue_sizing = config.queue_size;
 
                 // lunch bc function
                 run_bc<int, float, int>(
-                    ggraph_out,
+                    graph_o,
                     csr_graph,
                     src_node,
                     max_grid_size,
@@ -227,29 +188,20 @@ void dispatch_bc(
 /*
  * @brief gunrock_bc function
  *
- * @param[out] ggraph_out output of bc problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  bc_config  gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[out] graph_o output of bc problem
+ * @param[in]  graph_i input graph need to process on
+ * @param[in]  config  gunrock primitive specific configurations
+ * @param[in]  data_t  gunrock data_t struct
  */
-void gunrock_bc_func(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bc_config,
-    GunrockDataType    data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = bc_config.device;
+void gunrock_bc(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
-
-    // lunch dispatch function
-    dispatch_bc(
-        ggraph_out,
-        ggraph_in,
-        bc_config,
-        data_type,
-        *context);
+    dispatch_bc(graph_o, graph_i, config, data_t, *context);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index 026c7d6fc..1fe0300b5 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file bfs_app.cu
  *
- * @brief Gunrock Breadth-First Search implementation
+ * @brief Gunrock breadth-first search (BFS) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// BFS includes
+// breadth-first search includes
 #include <gunrock/app/bfs/bfs_enactor.cuh>
 #include <gunrock/app/bfs/bfs_problem.cuh>
 #include <gunrock/app/bfs/bfs_functor.cuh>
 
-// MGPU include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -39,8 +37,8 @@ using namespace gunrock::app::bfs;
  * @tparam MARK_PREDECESSORS
  * @tparam ENABLE_IDEMPOTENCE
  *
- * @param[out] ggraph_out Pointer to the output CSR graph
- * @param[in] ggraph_in Reference to the CSR graph we process on
+ * @param[out] graph_o Pointer to the output CSR graph
+ * @param[in] graph_i Reference to the CSR graph we process on
  * @param[in] src Source node where BFS starts
  * @param[in] max_grid_size Maximum CTA occupancy
  * @param[in] num_gpus Number of GPUs
@@ -48,115 +46,87 @@ using namespace gunrock::app::bfs;
  * @param[in] context Reference to CudaContext used by moderngpu functions
  *
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT,
-    bool MARK_PREDECESSORS,
-    bool ENABLE_IDEMPOTENCE >
+template<typename VertexId, typename Value, typename SizeT, 
+         bool MARK_PREDECESSORS, bool ENABLE_IDEMPOTENCE >
 void run_bfs(
-    GunrockGraph *ggraph_out,
-    const  Csr<VertexId, Value, SizeT> &ggraph_in,
-    const  VertexId src,
-    int    max_grid_size,
-    int    num_gpus,
-    double max_queue_sizing,
-    CudaContext& context) {
-    // Preparations
-    typedef BFSProblem <
-        VertexId,
-        SizeT,
-        Value,
-        MARK_PREDECESSORS,
-        ENABLE_IDEMPOTENCE,
-        (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE) > Problem;
-
+    GRGraph      *graph_o,
+    const  Csr<VertexId, Value, SizeT> &csr,
+    const        VertexId src,
+    const int    max_grid_size,
+    const int    num_gpus,
+    const double max_queue_sizing,
+    CudaContext  &context) {
+    typedef BFSProblem<VertexId, SizeT, Value, MARK_PREDECESSORS, 
+        ENABLE_IDEMPOTENCE, (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE)> Problem;
     // Allocate host-side label array for gpu-computed results
-    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes);
+    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     VertexId *h_preds = NULL;
     if (MARK_PREDECESSORS) {
-        //h_preds = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes);
+        //h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     }
 
-    // Allocate BFS enactor map
-    BFSEnactor<false> bfs_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      ggraph_in,
-                      num_gpus),
-                  "Problem BFS Initialization Failed", __FILE__, __LINE__);
+    BFSEnactor<false> enactor(false);  // Allocate BFS enactor map
+    Problem *problem = new Problem;    // Allocate problem on GPU
 
-    // Perform BFS
-    GpuTimer gpu_timer;
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "BFS Problem Initialization Failed", __FILE__, __LINE__);
 
-    util::GRError(csr_problem->Reset(
-                      src, bfs_enactor.GetFrontierType(), max_queue_sizing),
+    util::GRError(problem->Reset(
+                      src, enactor.GetFrontierType(), max_queue_sizing),
                   "BFS Problem Data Reset Failed", __FILE__, __LINE__);
 
-    gpu_timer.Start();
-    util::GRError(bfs_enactor.template Enact<Problem>(
-                      context, csr_problem, src, max_grid_size),
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, src, max_grid_size),
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-
-    float elapsed = gpu_timer.ElapsedMillis();
 
-    // Copy out results back to Host
-    util::GRError(csr_problem->Extract(h_labels, h_preds),
+    util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // label per node to GunrockGraph struct
-    ggraph_out->node_values = (int*)&h_labels[0];
+    graph_o->node_values = (int*)&h_labels[0];  // label per node to GRGraph struct
 
-    // Clean up
-    if (csr_problem) delete csr_problem;
+    if (problem) delete problem;
     //if (h_preds)     free(h_preds);
-
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  bfs_config bfs specific configurations
- * @param[in]  data_type  bfs data_type configurations
- * @param[in]  context    moderngpu context
+ * @param[out] graph_o  GRGraph type output
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   Specific configurations
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  ModernGPU context
  */
 void dispatch_bfs(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bfs_config,
-    GunrockDataType    data_type,
-    CudaContext&       context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // build input csr format graph
                 Csr<int, int, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
                 // default configurations
-                int   src_node      = 0;       //!< default source vertex to start
-                int   num_gpus      = 1;       //!< number of GPUs for multi-gpu enactor to use
-                int   max_grid_size = 0;       //!< maximum grid size (0: leave it up to the enactor)
-                bool  mark_pred     = false;   //!< whether to mark predecessor or not
-                bool  idempotence   = false;   //!< whether or not to enable idempotence
-                float max_queue_sizing = 1.0f; //!< maximum size scaling factor for work queues
+                int   src_node      = 0;  // default source vertex to start
+                int   num_gpus      = 1;  // number of GPUs for multi-gpu
+                int   max_grid_size = 0;  // leave it up to the enactor
+                bool  mark_pred     = 0;  // whether to mark predecessor or not
+                bool  idempotence   = 0;  // whether or not enable idempotence
+                float max_queue_sizing = 1.0f;  // maximum size scaling factor
 
                 // determine source vertex to start bfs
-                switch (bfs_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -167,7 +137,7 @@ void dispatch_bfs(
                     break;
                 }
                 case manually: {
-                    src_node = bfs_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -175,14 +145,14 @@ void dispatch_bfs(
                     break;
                 }
                 }
-                mark_pred        = bfs_config.mark_pred;
-                idempotence      = bfs_config.idempotence;
-                max_queue_sizing = bfs_config.queue_size;
+                mark_pred        = config.mark_pred;
+                idempotence      = config.idempotence;
+                max_queue_sizing = config.queue_size;
 
                 if (mark_pred) {
                     if (idempotence) {
                         run_bfs<int, int, int, true, true>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
                             max_grid_size,
@@ -191,7 +161,7 @@ void dispatch_bfs(
                             context);
                     } else {
                         run_bfs<int, int, int, true, false>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
                             max_grid_size,
@@ -202,7 +172,7 @@ void dispatch_bfs(
                 } else {
                     if (idempotence) {
                         run_bfs<int, int, int, false, true>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
                             max_grid_size,
@@ -211,7 +181,7 @@ void dispatch_bfs(
                             context);
                     } else {
                         run_bfs<int, int, int, false, false>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
                             max_grid_size,
@@ -225,14 +195,12 @@ void dispatch_bfs(
                 csr_graph.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // not yet support
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // not yet support
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
@@ -249,24 +217,20 @@ void dispatch_bfs(
 /*
  * @brief gunrock_bfs function
  *
- * @param[out] ggraph_out output subgraph of bfs problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  bfs_config gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[out] graph_o output subgraph of bfs problem
+ * @param[in]  graph_i input graph need to process on
+ * @param[in]  config  gunrock primitive specific configurations
+ * @param[in]  data_t  gunrock data type configurations
  */
-void gunrock_bfs_func(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bfs_config,
-    GunrockDataType    data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = bfs_config.device;
+void gunrock_bfs(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
-
-    // launch dispatch function
-    dispatch_bfs(ggraph_out, ggraph_in, bfs_config, data_type, *context);
+    dispatch_bfs(graph_o, graph_i, config, data_t, *context);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu
index 97723087c..1f49d0e2b 100644
--- a/gunrock/app/cc/cc_app.cu
+++ b/gunrock/app/cc/cc_app.cu
@@ -8,20 +8,15 @@
 /**
  * @file cc_app.cu
  *
- * @brief connected component implementation.
+ * @brief connected component (CC) application
  */
 
-#include <stdio.h>
-#include <string>
-#include <deque>
-#include <vector>
-#include <iostream>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// CC includes
+// connected component includes
 #include <gunrock/app/cc/cc_enactor.cuh>
 #include <gunrock/app/cc/cc_problem.cuh>
 #include <gunrock/app/cc/cc_functor.cuh>
@@ -38,112 +33,83 @@ using namespace gunrock::app::cc;
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to output CSR graph
+ * @param[out] graph_o Pointer to output CSR graph
  * @param[in] csr_graph Reference to the CSR graph we process on
  * @param[in] max_grid_size Maximum CTA occupancy for CC kernels
  * @param[in] num_gpus Number of GPUs
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
+template<typename VertexId, typename Value, typename SizeT>
 void run_cc(
-    GunrockGraph *ggraph_out,
+    GRGraph      *graph_o,
     unsigned int *components,
-    const Csr<VertexId, Value, SizeT> &csr_graph,
+    const Csr<VertexId, Value, SizeT> &csr,
     const int    max_grid_size,
     const int    num_gpus) {
-
-    // Define CCProblem
-    typedef CCProblem <
-        VertexId,
-        SizeT,
-        Value,
-        true > Problem; //use double buffer
+    typedef CCProblem<VertexId, SizeT, Value, true> Problem; // double buffer
 
     // Allocate host-side label array for gpu-computed results
     VertexId *h_component_ids
-        = (VertexId*)malloc(sizeof(VertexId) * csr_graph.nodes);
-
-    // Allocate CC enactor map
-    CCEnactor<false> cc_enactor(false);
+        = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
+    CCEnactor<false> cc_enactor(false);  // Allocate CC enactor map
+    Problem *problem = new Problem;  // Allocate problem on GPU
 
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      csr_graph,
-                      num_gpus),
+    util::GRError(problem->Init(false, csr, num_gpus),
                   "CC Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Reset CC Problem Data
-    util::GRError(csr_problem->Reset(
+    util::GRError(problem->Reset(
                       cc_enactor.GetFrontierType()),
                   "CC Problem Data Reset Failed", __FILE__, __LINE__);
 
-    // Perform Connected Component
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    // Lunch CC Enactor
     util::GRError(cc_enactor.template Enact<Problem>(
-                      csr_problem, max_grid_size),
+                      problem, max_grid_size),
                   "CC Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    float elapsed = gpu_timer.ElapsedMillis();
 
-    // Copy out results back to Host Device
-    util::GRError(csr_problem->Extract(h_component_ids),
+    util::GRError(problem->Extract(h_component_ids),
                   "CC Problem Data Extraction Failed", __FILE__, __LINE__);
 
     // Compute number of components in graph
-    unsigned int temp = csr_problem->num_components;
+    unsigned int temp = problem->num_components;
     *components = temp;
 
-    // copy component_id per node to GunrockGraph struct
-    ggraph_out->node_values = (int*)&h_component_ids[0];
-
-    printf("GPU Connected Component finished in %lf msec.\n", elapsed);
-
-    // Cleanup
-    if (csr_problem)  delete csr_problem;
+    // copy component_id per node to GRGraph struct
+    graph_o->node_values = (int*)&h_component_ids[0];
 
+    if (problem)  delete problem;
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  cc_config  cc specific configurations
- * @param[in]  data_type  data type configurations
+ * @param[out] graph_o GRGraph type output
+ * @param[in]  graph_i GRGraph type input graph
+ * @param[in]  config  cc specific configurations
+ * @param[in]  data_t  data type configurations
  */
 void dispatch_cc(
-    GunrockGraph          *ggraph_out,
-    unsigned int          *components,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   cc_config,
-    const GunrockDataType data_type) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    unsigned int  *components,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // build input csr format graph
                 Csr<int, int, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
-                int max_grid_size = 0; //!< 0: leave it up to the enactor
-                int num_gpus      = 1; //!< number of GPUs
+                int max_grid_size = 0;  // 0: leave it up to the enactor
+                int num_gpus      = 1;  // number of GPUs
 
-                // lunch cc dispatch function
                 run_cc<int, int, int>(
-                    ggraph_out,
+                    graph_o,
                     (unsigned int*)components,
                     csr_graph,
                     max_grid_size,
@@ -154,13 +120,11 @@ void dispatch_cc(
                 csr_graph.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
@@ -176,20 +140,18 @@ void dispatch_cc(
 /*
  * @brief gunrock_cc function
  *
- * @param[out] ggraph_out output subgraph of cc problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  cc_configs primitive specific configurations
- * @param[in]  data_type  gunrock data_type struct
+ * @param[out] graph_o output subgraph of cc problem
+ * @param[in]  graph_i input graph need to process on
+ * @param[in]  config  primitive specific configurations
+ * @param[in]  data_t  gunrock data type configurations
  */
-void gunrock_cc_func(
-    GunrockGraph          *ggraph_out,
-    unsigned int          *components,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   cc_configs,
-    const GunrockDataType data_type) {
-
-    // lunch dispatch function
-    dispatch_cc(ggraph_out, components, ggraph_in, cc_configs, data_type);
+void gunrock_cc(
+    GRGraph       *graph_o,
+    unsigned int  *components,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    dispatch_cc(graph_o, components, graph_i, config, data_t);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/mst/mst_app.cu b/gunrock/app/mst/mst_app.cu
index 283345aaa..55e350471 100644
--- a/gunrock/app/mst/mst_app.cu
+++ b/gunrock/app/mst/mst_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file mst_app.cu
  *
- * @brief minimum spanning tree (MST) problem implementation
+ * @brief minimum spanning tree (MST) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// Primitive-specific includes
+// primitive-specific includes
 #include <gunrock/app/mst/mst_enactor.cuh>
 #include <gunrock/app/mst/mst_problem.cuh>
 #include <gunrock/app/mst/mst_functor.cuh>
 
-// ModernGPU include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -37,43 +35,37 @@ using namespace gunrock::app::mst;
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] graph_o GunrockGraph type output graph
- * @param[in]  csr Reference to the CSR graph we process on
- * @param[in]  max_grid_size Maximum CTA occupancy
- * @param[in]  num_gpus Number of GPUs
- * @param[in]  context moderngpu context
+ * @param[out] graph_o   GRGraph type output graph
+ * @param[in]  csr       Reference to the CSR graph we process on
+ * @param[in]  max_grid  Maximum CTA occupancy
+ * @param[in]  num_gpus  Number of GPUs
+ * @param[in]  context   Modern GPU context
  */
 template<typename VertexId, typename Value, typename SizeT>
 void run_mst(
-    GunrockGraph   *graph_o,
+    GRGraph *graph_o,
     const Csr<VertexId, Value, SizeT> &csr,
-    const int      max_grid_size,
-    const int      num_gpus,
-    CudaContext    &context) {
-    typedef MSTProblem<VertexId, SizeT, Value, true> Problem;  // preperations
+    const int    max_grid,
+    const int    num_gpus,
+    CudaContext  &context) {
+    typedef MSTProblem<VertexId, SizeT, Value, true> Problem;  // preparations
     MSTEnactor<false> enactor(false);                          // enactor map
-    VertexId  *h_mst = new VertexId[csr.edges];                // host array
+    VertexId *h_mst  = new VertexId[csr.edges];                // results array
     Problem *problem = new Problem;                            // problem on GPU
+
     util::GRError(problem->Init(false, csr, num_gpus),
-                  "MST Problem Data Initialization Failed", __FILE__, __LINE__);
+                  "MST Data Initialization Failed", __FILE__, __LINE__);
 
     util::GRError(problem->Reset(enactor.GetFrontierType()),
-                  "MST Problem Data Reset Failed", __FILE__, __LINE__);
-
-    CpuTimer gpu_timer;
+                  "MST Data Reset Failed", __FILE__, __LINE__);
 
-    gpu_timer.Start();
-    util::GRError(enactor.template Enact<Problem>(
-                      context, problem, max_grid_size),
-                  "MST Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    float elapsed = gpu_timer.ElapsedMillis();
+    util::GRError(enactor.template Enact<Problem>(context, problem, max_grid),
+                  "MST Enact Failed", __FILE__, __LINE__);
 
     util::GRError(problem->Extract(h_mst),
-                  "MST Problem Data Extraction Failed", __FILE__, __LINE__);
+                  "MST Data Extraction Failed", __FILE__, __LINE__);
 
-    // output mst results: 0 | 1 mask for all edges
-    graph_o->edge_values = (int*)&h_mst[0];
+    graph_o->edge_values = (int*)&h_mst[0];  // output: 0|1 mask for all edges
 
     if (problem) { delete problem; }
 
@@ -83,23 +75,23 @@ void run_mst(
 /**
  * @brief dispatch function to handle data types
  *
- * @param[out] graph_o  GunrockGraph type output graph
- * @param[in]  graph_i  GunrockGraph type input graph
- * @param[in]  configs  MST-specific configurations
- * @param[in]  datatype data type configurations
- * @param[in]  context  moderngpu context parameter
+ * @param[out] graph_o  GRGraph type output graph
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   MST-specific configurations
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  Modern GPU context parameter
  */
 void dispatch_mst(
-    GunrockGraph          *graph_o,
-    const GunrockGraph    *graph_i,
-    const GunrockConfig   configs,
-    const GunrockDataType datatype,
+    GRGraph          *graph_o,
+    const GRGraph    *graph_i,
+    const GRSetup   config,
+    const GRTypes data_t,
     CudaContext           &context) {
-    switch (datatype.VTXID_TYPE) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (datatype.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (datatype.VALUE_TYPE) {
+            switch (data_t.VALUE_TYPE) {
             case VALUE_INT: {  // template type = <int, int, int>
                 // create a CSR formatted graph
                 Csr<int, int, int> csr(false);
@@ -108,23 +100,42 @@ void dispatch_mst(
                 csr.row_offsets    = (int*)graph_i->row_offsets;
                 csr.column_indices = (int*)graph_i->col_indices;
                 csr.edge_values    = (int*)graph_i->edge_values;
+
                 // configurations if necessary
-                int num_gpus      = 1;  // number of GPU(s) to use
-                int max_grid_size = 0;  // leave it up tp the enactor
+                int num_gpus = 1;  // number of GPU(s) to use
+                int max_grid = 0;  // leave it up to the enactor
                 run_mst<int, int, int>(
-                    graph_o, csr, max_grid_size, num_gpus, context);
+                    graph_o, csr, max_grid, num_gpus, context);
+
                 // reset for free memory
-                csr.row_offsets = NULL;
+                csr.row_offsets    = NULL;
                 csr.column_indices = NULL;
-                csr.edge_values = NULL;
+                csr.edge_values    = NULL;
                 break;
             }
-            case VALUE_UINT: {  // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, unsigned int, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
             case VALUE_FLOAT: {  // template type = <int, float, int>
-                printf("Not Yet Support This DataType Combination.\n");
+                // create a CSR formatted graph
+                Csr<int, float, int> csr(false);
+                csr.nodes = graph_i->num_nodes;
+                csr.edges = graph_i->num_edges;
+                csr.row_offsets    = (int*)graph_i->row_offsets;
+                csr.column_indices = (int*)graph_i->col_indices;
+                csr.edge_values  = (float*)graph_i->edge_values;
+
+                // configurations if necessary
+                int num_gpus = 1;  // number of GPU(s) to use
+                int max_grid = 0;  // leave it up to the enactor
+                run_mst<int, float, int>(
+                    graph_o, csr, max_grid, num_gpus, context);
+
+                // reset for free memory
+                csr.row_offsets    = NULL;
+                csr.column_indices = NULL;
+                csr.edge_values    = NULL;
                 break;
             }
             }
@@ -143,20 +154,20 @@ void dispatch_mst(
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] graph_o  GunrockGraph type output graph
- * @param[in]  graph_i  GunrockGraph type input graph
- * @param[in]  configs  Gunrock primitive-specific configurations
- * @param[in]  datatype data type configurations
+ * @param[out] graph_o GRGraph type output graph
+ * @param[in]  graph_i GRGraph type input graph
+ * @param[in]  config  Primitive-specific configurations
+ * @param[in]  data_t  Data type configurations
  */
 void gunrock_mst(
-    GunrockGraph          *graph_o,
-    const GunrockGraph    *graph_i,
-    const GunrockConfig    configs,
-    const GunrockDataType  datatype) {
-    int device = 0;  // default use GPU 0
-    device = configs.device;
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
-    dispatch_mst(graph_o, graph_i, configs, datatype, *context);
+    dispatch_mst(graph_o, graph_i, config, data_t, *context);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/pr/pr_app.cu b/gunrock/app/pr/pr_app.cu
index 8a7200595..47a9e5862 100644
--- a/gunrock/app/pr/pr_app.cu
+++ b/gunrock/app/pr/pr_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file pr_app.cu
  *
- * @brief Gunrock PageRank Implementation
+ * @brief Gunrock PageRank application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// Page Rank includes
+// page-rank includes
 #include <gunrock/app/pr/pr_enactor.cuh>
 #include <gunrock/app/pr/pr_problem.cuh>
 #include <gunrock/app/pr/pr_functor.cuh>
 
-// Moderngpu include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -31,33 +29,30 @@ using namespace gunrock::oprtr;
 using namespace gunrock::app::pr;
 
 /**
- * @brief run page rank
+ * @brief run page-rank
  *
  * @tparam VertexId
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to output CSR graph
+ * @param[out] graph_o Pointer to output CSR graph
  * @param[out] node_ids Pointer to output node IDs
  * @param[out] page_rank Pointer to output PageRanks
- * @param[in] graph Reference to the CSR graph we process on
- * @param[in] source Source ID for personalized PageRank (-1 for general PageRank)
- * @param[in] delta Delta value for computing Page Rank, usually set to .85
+ * @param[in] csr Reference to the CSR graph we process on
+ * @param[in] source Source ID for personalized PR (-1 for general PageRank)
+ * @param[in] delta Delta value for computing PageRank, usually set to 0.85
  * @param[in] error Error threshold value
  * @param[in] max_iter Max iteration for Page Rank computing
  * @param[in] max_grid_size Maximum CTA occupancy
  * @param[in] num_gpus Number of GPUs
  * @param[in] context CudaContext for moderngpu to use
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
-void run_page_rank(
-    GunrockGraph   *ggraph_out,
+template<typename VertexId, typename Value, typename SizeT>
+void run_pagerank(
+    GRGraph        *graph_o,
     VertexId       *node_ids,
     Value          *page_rank,
-    const Csr<VertexId, Value, SizeT> &graph,
+    const Csr<VertexId, Value, SizeT> &csr,
     const VertexId source,
     const Value    delta,
     const Value    error,
@@ -65,106 +60,78 @@ void run_page_rank(
     const int      max_grid_size,
     const int      num_gpus,
     CudaContext&   context) {
-    typedef PRProblem <
-        VertexId,
-        SizeT,
-        Value > Problem;
-
-    // Allocate host-side label array for gpu-computed results
-    //Value    *h_rank    = (Value*)malloc(sizeof(Value) * graph.nodes);
-    //VertexId *h_node_id = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
-
-    // Allocate Page Rank enactor map
-    PREnactor<false> pr_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus),
-                  "PageRank Problem Initialization Failed", __FILE__, __LINE__);
-
-    // Perform PageRank
-    GpuTimer gpu_timer;
-
-    util::GRError(csr_problem->Reset(
-                      source, delta, error, pr_enactor.GetFrontierType()),
-                  "PageRank Problem Data Reset Failed", __FILE__, __LINE__);
-    gpu_timer.Start();
-    util::GRError(pr_enactor.template Enact<Problem>(
-                      context, csr_problem, max_iter, max_grid_size),
-                  "PageRank Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-
-    float elapsed = gpu_timer.ElapsedMillis();
-
-    // Copy out results
-    util::GRError(csr_problem->Extract(page_rank, node_ids),
-                  "PageRank Problem Data Extraction Failed",
-                  __FILE__, __LINE__);
-
-    // Cleanup
-    if (csr_problem) delete csr_problem;
-    //if (h_node_id)   free(h_node_id);
-    //if (h_rank)      free(h_rank);
+    typedef PRProblem<VertexId, SizeT, Value> Problem;
+    PREnactor<false> enactor(false);  // PageRank enactor map
+    Problem *problem = new Problem;   // Allocate problem on GPU
 
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "PR Problem Initialization Failed", __FILE__, __LINE__);
+
+    util::GRError(problem->Reset(
+                      source, delta, error, enactor.GetFrontierType()),
+                  "PR Problem Data Reset Failed", __FILE__, __LINE__);
+
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, max_iter, max_grid_size),
+                  "PR Problem Enact Failed", __FILE__, __LINE__);
+
+    util::GRError(problem->Extract(page_rank, node_ids),
+                  "PR Problem Extraction Failed", __FILE__, __LINE__);
+
+    if (problem) delete problem;
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out output of pr problem
+ * @param[out] graph_o    output of pr problem
  * @param[out] node_ids   output of pr problem
- * @param[out] page_rank  output of pr problem
+ * @param[out] pagerank   output of pr problem
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  pr_config  pr specific configurations
- * @param[in]  data_type  data type configurations
+ * @param[in]  graph_i    GRGraph type input graph
+ * @param[in]  config     specific configurations
+ * @param[in]  data_t     data type configurations
  * @param[in]  context    moderngpu context
  */
-void dispatch_page_rank(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *page_rank,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   pr_config,
-    const GunrockDataType data_type,
-    CudaContext&          context) {
-    switch (data_type.VTXID_TYPE) {
+void dispatch_pagerank(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *pagerank,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // build input csr format graph
                 Csr<int, float, int> csr_graph(false);
-                csr_graph.nodes          = ggraph_in->num_nodes;
-                csr_graph.edges          = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
-
-                // page rank configurations
-                float delta         = 0.85f; //!< default delta value
-                float error         = 0.01f; //!< error threshold
-                int   max_iter      = 20;    //!< maximum number of iterations
-                int   max_grid_size = 0;     //!< 0: leave it up to the enactor
-                int   num_gpus      = 1;     //!< for multi-gpu enactor to use
-                int   src_node      = -1;    //!< source node to start
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+
+                // page-rank configurations
+                float delta         = 0.85f;  // default delta value
+                float error         = 0.01f;  // error threshold
+                int   max_iter      = 20;     // maximum number of iterations
+                int   max_grid_size = 0;      // 0: leave it up to the enactor
+                int   num_gpus      = 1;      // for multi-gpu enactor to use
+                int   src_node      = -1;     // source node to start
 
-                // determine source vertex to start sssp
+                // determine source vertex to start page-rank
-                switch (pr_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -175,7 +142,7 @@ void dispatch_page_rank(
                     break;
                 }
                 case manually: {
-                    src_node = pr_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -183,14 +150,14 @@ void dispatch_page_rank(
                     break;
                 }
                 }
-                delta    = pr_config.delta;
-                error    = pr_config.error;
-                max_iter = pr_config.max_iter;
+                delta    = config.delta;
+                error    = config.error;
+                max_iter = config.max_iter;
 
-                run_page_rank<int, float, int>(
-                    ggraph_out,
+                run_pagerank<int, float, int>(
+                    graph_o,
                     (int*)node_ids,
-                    (float*)page_rank,
+                    (float*)pagerank,
                     csr_graph,
                     src_node,
                     delta,
@@ -215,37 +182,27 @@ void dispatch_page_rank(
 }
 
 /**
- * @brief run_page_rank entry
+ * @brief gunrock_pagerank entry
  *
- * @param[out] ggraph_out output of pr problem
+ * @param[out] graph_o    output of pr problem
  * @param[out] node_ids   output of pr problem
- * @param[out] page_rank  output of pr problem
+ * @param[out] pagerank   output of pr problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  pr_config  gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[in]  graph_i    input graph need to process on
+ * @param[in]  config     gunrock primitive specific configurations
+ * @param[in]  data_t     gunrock data type configurations
  */
-void gunrock_pr_func(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *page_rank,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   pr_config,
-    const GunrockDataType data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = pr_config.device;
+void gunrock_pagerank(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *pagerank,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
-
-    // luanch dispatch function
-    dispatch_page_rank(
-        ggraph_out,
-        node_ids,
-        page_rank,
-        ggraph_in,
-        pr_config,
-        data_type,
-        *context);
+    dispatch_pagerank(
+        graph_o, node_ids, pagerank, graph_i, config, data_t, *context);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/pr/pr_enactor.cuh b/gunrock/app/pr/pr_enactor.cuh
index 1ed922bc1..423196b92 100644
--- a/gunrock/app/pr/pr_enactor.cuh
+++ b/gunrock/app/pr/pr_enactor.cuh
@@ -243,7 +243,7 @@ public:
             cudaEventCreate(&start);
             cudaEventCreate(&stop);
             cudaEventRecord(start, 0);
-            long long frontier_attribute_queue_length = graph_slice->nodes;
+            SizeT frontier_attribute_queue_length = graph_slice->nodes;
 
             // Step through PageRank iterations
             while (done[0] < 0) {
@@ -283,7 +283,7 @@ public:
                     if (retval = work_progress.GetQueueLength(
                             frontier_attribute.queue_index+1,
                             frontier_attribute_queue_length)) break;
-                    printf(", %lld",
+                    printf(", %lld",
                            (long long) frontier_attribute_queue_length);
                 }
 
@@ -360,7 +360,7 @@ public:
                     enactor_stats.total_queued +=
                         frontier_attribute_queue_length;
                     if (DEBUG) {
-                        printf(", %lld", frontier_attribute_queue_length);
+                        printf(", %lld", (long long) frontier_attribute_queue_length);
                     }
                     if (INSTRUMENT) {
                         if (retval=enactor_stats.filter_kernel_stats.Accumulate(
diff --git a/gunrock/app/sssp/sssp_app.cu b/gunrock/app/sssp/sssp_app.cu
index fa55888be..fd02d0b3c 100644
--- a/gunrock/app/sssp/sssp_app.cu
+++ b/gunrock/app/sssp/sssp_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file sssp_app.cu
  *
- * @brief single-source shortest path problem implementation
+ * @brief single-source shortest path (SSSP) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// SSSP includes
+// single-source shortest path includes
 #include <gunrock/app/sssp/sssp_enactor.cuh>
 #include <gunrock/app/sssp/sssp_problem.cuh>
 #include <gunrock/app/sssp/sssp_functor.cuh>
 
-// Moderngpu include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -38,7 +36,7 @@ using namespace gunrock::app::sssp;
  * @tparam SizeT
  * @tparam MARK_PREDECESSORS
  *
- * @param[out] ggraph_out GunrockGraph type output
+ * @param[out] graph_o GRGraph type output
  * @param[out] predecessor return predeessor if mark_pred = true
  * @param[in]  graph Reference to the CSR graph we process on
  * @param[in]  source Source node where SSSP starts
@@ -48,125 +46,164 @@ using namespace gunrock::app::sssp;
  * @param[in]  delta_factor user set
  * @param[in]  context moderngpu context
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT,
-    bool MARK_PREDECESSORS >
+template<typename VertexId, typename Value, typename SizeT, 
+         bool MARK_PREDECESSORS>
 void run_sssp(
-    GunrockGraph   *ggraph_out,
+    GRGraph        *graph_o,
     VertexId       *predecessor,
-    const Csr<VertexId, Value, SizeT> &graph,
-    const VertexId source,
+    const Csr<VertexId, Value, SizeT> &csr,
+    const VertexId src,
     const int      max_grid_size,
     const float    queue_sizing,
     const int      num_gpus,
     const int      delta_factor,
-    CudaContext& context) {
-    // Preparations
-    typedef SSSPProblem <
-        VertexId,
-        SizeT,
-        Value,
-        MARK_PREDECESSORS > Problem;
-
+    CudaContext    &context) {
+    typedef SSSPProblem<VertexId, SizeT, Value, MARK_PREDECESSORS> Problem;
     // Allocate host-side label array for gpu-computed results
-    unsigned int *h_labels
-        = (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes);
+    Value *h_labels = (Value*)malloc(sizeof(Value) * csr.nodes);
     //VertexId     *h_preds  = NULL;
 
     if (MARK_PREDECESSORS) {
-        //h_preds = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
+        //h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     }
 
-    // Allocate SSSP enactor map
-    SSSPEnactor<false> sssp_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus,
-                      delta_factor),
-                  "Problem SSSP Initialization Failed", __FILE__, __LINE__);
+    SSSPEnactor<false> enactor(false);  // enactor map
+    Problem *problem = new Problem;
+    util::GRError(problem->Init(false, csr, num_gpus, delta_factor),
+                  "SSSP Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Perform SSSP
-    CpuTimer gpu_timer;
-
-    util::GRError(csr_problem->Reset(
-                      source, sssp_enactor.GetFrontierType(), queue_sizing),
+    util::GRError(problem->Reset(src, enactor.GetFrontierType(), queue_sizing),
                   "SSSP Problem Data Reset Failed", __FILE__, __LINE__);
-    gpu_timer.Start();
-    util::GRError(sssp_enactor.template Enact<Problem>(
-                      context, csr_problem, source,
-                      queue_sizing, max_grid_size),
+
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, src, queue_sizing, max_grid_size),
                   "SSSP Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    float elapsed = gpu_timer.ElapsedMillis();
 
-    // Copy out results
-    util::GRError(csr_problem->Extract(h_labels, predecessor),
+    util::GRError(problem->Extract(h_labels, predecessor),
                   "SSSP Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // copy label_values per node to GunrockGraph output
-    ggraph_out->node_values = (unsigned int*)&h_labels[0];
-
-    if (csr_problem) delete csr_problem;
-    //if (h_labels)    free(h_labels);
-    //if (h_preds)     free(h_preds);
+    // copy label_values per node to GRGraph output
+    graph_o->node_values = (Value*)&h_labels[0];
 
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out  GunrockGraph type output
- * @param[out] predecessor return predeessor if mark_pred = true
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  sssp_config sssp specific configurations
- * @param[in]  data_type   sssp data_type configurations
- * @param[in]  context     moderngpu context
+ * @param[out] graph_o     GRGraph type output
+ * @param[out] predecessor Return predecessor if mark_pred = true
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      Primitive-specific configurations
+ * @param[in]  data_t      Data type configurations
+ * @param[in]  context     ModernGPU context
  */
 void dispatch_sssp(
-    GunrockGraph          *ggraph_out,
-    void                  *predecessor,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   sssp_config,
-    const GunrockDataType data_type,
-    CudaContext&          context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    void          *predecessor,
+    const GRGraph *graph_i,
+    const GRSetup config,
+    const GRTypes data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
-                // not support yet
-                printf("Not Yet Support This DataType Combination.\n");
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
+                Csr<int, int, int> csr_graph(false);
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+                csr_graph.edge_values    = (int*)graph_i->edge_values;
+
+                // sssp configurations
+                bool  mark_pred        =   0;  // whether to mark predecessors
+                int   src_node         =   0;  // source vertex to start
+                int   num_gpus         =   1;  // number of GPUs
+                int   delta_factor     =   1;  // default delta_factor = 1
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                float max_queue_sizing = 1.0;  // default maximum queue sizing
+
+                // determine source vertex to start sssp
+                switch (config.src_mode) {
+                case randomize: {
+                    src_node = graphio::RandomNode(csr_graph.nodes);
+                    break;
+                }
+                case largest_degree: {
+                    int max_deg = 0;
+                    src_node = csr_graph.GetNodeWithHighestDegree(max_deg);
+                    break;
+                }
+                case manually: {
+                    src_node = config.src_node;
+                    break;
+                }
+                default: {
+                    src_node = 0;
+                    break;
+                }
+                }
+                mark_pred        = config.mark_pred;
+                delta_factor     = config.delta_factor;
+                max_queue_sizing = config.queue_size;
+
+                switch (mark_pred) {
+                case true: {
+                    run_sssp<int, int, int, true>(
+                        graph_o,
+                        (int*)predecessor,
+                        csr_graph,
+                        src_node,
+                        max_grid_size,
+                        max_queue_sizing,
+                        num_gpus,
+                        delta_factor,
+                        context);
+                    break;
+                }
+                case false: {
+                    run_sssp<int, int, int, false>(
+                        graph_o,
+                        (int*)predecessor,
+                        csr_graph,
+                        src_node,
+                        max_grid_size,
+                        max_queue_sizing,
+                        num_gpus,
+                        delta_factor,
+                        context);
+                    break;
+                }
+                }
+                // reset for free memory
+                csr_graph.row_offsets    = NULL;
+                csr_graph.column_indices = NULL;
+                csr_graph.edge_values    = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // build input csr format graph
                 Csr<int, unsigned int, int> csr_graph(false);
-                csr_graph.nodes          = ggraph_in->num_nodes;
-                csr_graph.edges          = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
-                csr_graph.edge_values    = (unsigned int*)ggraph_in->edge_values;
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+                csr_graph.edge_values    = (unsigned int*)graph_i->edge_values;
 
                 // sssp configurations
-                bool  mark_pred        = false;
-                int   src_node         = 0; //!< use whatever the specified graph-type's default is
-                int   num_gpus         = 1; //!< number of GPUs for multi-gpu enactor to use
-                int   delta_factor     = 1; //!< default delta_factor = 1
-                int   max_grid_size    = 0; //!< maximum grid size (0: leave it up to the enactor)
-                float max_queue_sizing = 1.0; //!< default maximum queue sizing
+                bool  mark_pred        =   0;  // whether to mark predecessors
+                int   src_node         =   0;  // source vertex to start
+                int   num_gpus         =   1;  // number of GPUs
+                int   delta_factor     =   1;  // default delta_factor = 1
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                float max_queue_sizing = 1.0;  // default maximum queue sizing
 
                 // determine source vertex to start sssp
-                switch (sssp_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -177,7 +214,7 @@ void dispatch_sssp(
                     break;
                 }
                 case manually: {
-                    src_node = sssp_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -185,14 +222,14 @@ void dispatch_sssp(
                     break;
                 }
                 }
-                mark_pred        = sssp_config.mark_pred;
-                delta_factor     = sssp_config.delta_factor;
-                max_queue_sizing = sssp_config.queue_size;
+                mark_pred        = config.mark_pred;
+                delta_factor     = config.delta_factor;
+                max_queue_sizing = config.queue_size;
 
                 switch (mark_pred) {
                 case true: {
                     run_sssp<int, unsigned int, int, true>(
-                        ggraph_out,
+                        graph_o,
                         (int*)predecessor,
                         csr_graph,
                         src_node,
@@ -205,7 +242,7 @@ void dispatch_sssp(
                 }
                 case false: {
                     run_sssp<int, unsigned int, int, false>(
-                        ggraph_out,
+                        graph_o,
                         (int*)predecessor,
                         csr_graph,
                         src_node,
@@ -245,32 +282,22 @@ void dispatch_sssp(
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out  GunrockGraph type output
- * @param[out] predecessor return predeessor if mark_pred = true
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  sssp_config gunrock primitive specific configurations
- * @param[in]  data_type   data_type configurations
+ * @param[out] graph_o     GRGraph type output
+ * @param[out] predecessor Return predecessor if mark_pred = true
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      Primitive specific configurations
+ * @param[in]  data_t      Data type configurations
  */
-void gunrock_sssp_func(
-    GunrockGraph          *ggraph_out,
-    void                  *predecessor,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   sssp_config,
-    const GunrockDataType data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = sssp_config.device;
+void gunrock_sssp(
+    GRGraph       *graph_o,
+    void          *predecessor,
+    const GRGraph *graph_i,
+    const GRSetup config,
+    const GRTypes data_t) {
+    int device = 0;  // CUDA device ordinal; GRSetup::device is declared int
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
-
-    // lunch dispatch function
-    dispatch_sssp(
-        ggraph_out,
-        predecessor,
-        ggraph_in,
-        sssp_config,
-        data_type,
-        *context);
+    dispatch_sssp(graph_o, predecessor, graph_i, config, data_t, *context);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/topk/topk_app.cu b/gunrock/app/topk/topk_app.cu
index 5b2855259..0e38c2fcf 100644
--- a/gunrock/app/topk/topk_app.cu
+++ b/gunrock/app/topk/topk_app.cu
@@ -1,20 +1,16 @@
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 // Gunrock -- Fast and Efficient GPU Graph Library
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 // This source code is distributed under the terms of LICENSE.TXT
 // in the root directory of this source distribution.
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 
 /**
  * @file topk_app.cu
  *
- * @brief top k degree centralities implementation
+ * @brief top k degree centralities application
  */
 
-#include <cstdlib>
-#include <stdio.h>
-#include <vector>
-#include <iostream>
 #include <gunrock/gunrock.h>
 #include <gunrock/graphio/market.cuh>
 #include <gunrock/app/topk/topk_enactor.cuh>
@@ -77,7 +73,7 @@ template <
     typename Value,
     typename SizeT >
 void build_topk_subgraph(
-    GunrockGraph *subgraph,
+    GRGraph *subgraph,
     const Csr<VertexId, Value, SizeT> &graph_original,
     const Csr<VertexId, Value, SizeT> &graph_reversed,
     VertexId  *node_ids,
@@ -173,49 +169,32 @@ template <
     typename Value,
     typename SizeT >
 void run_topk(
-    GunrockGraph *graph_out,
+    GRGraph *graph_out,
     VertexId     *node_ids,
     Value        *in_degrees,
     Value        *out_degrees,
     const Csr<VertexId, Value, SizeT> &graph_original,
     const Csr<VertexId, Value, SizeT> &graph_reversed,
     SizeT        top_nodes) {
-    // preparations
     typedef TOPKProblem<VertexId, SizeT, Value> Problem;
-    TOPKEnactor<false> topk_enactor(false);
-    Problem *topk_problem = new Problem;
-
-    // reset top_nodes if necessary
+    TOPKEnactor<false> enactor(false);
+    Problem *problem = new Problem;
     top_nodes =
         (top_nodes > graph_original.nodes) ? graph_original.nodes : top_nodes;
 
-    // initialization
-    util::GRError(topk_problem->Init(
-                      false,
-                      graph_original,
-                      graph_reversed,
-                      1),
+    util::GRError(problem->Init(false, graph_original, graph_reversed, 1),
                   "Problem TOPK Initialization Failed", __FILE__, __LINE__);
 
-    // reset data slices
-    util::GRError(topk_problem->Reset(topk_enactor.GetFrontierType()),
+    util::GRError(problem->Reset(enactor.GetFrontierType()),
                   "TOPK Problem Data Reset Failed", __FILE__, __LINE__);
 
-    // launch gpu topk enactor to calculate top k nodes
-    util::GRError(topk_enactor.template Enact<Problem>(
-                      topk_problem,
-                      top_nodes),
+    util::GRError(enactor.template Enact<Problem>(problem, top_nodes),
                   "TOPK Problem Enact Failed", __FILE__, __LINE__);
 
-    // copy out results back to cpu
-    util::GRError(topk_problem->Extract(
-                      node_ids,
-                      in_degrees,
-                      out_degrees,
-                      top_nodes),
+    util::GRError(problem->Extract(node_ids, in_degrees, out_degrees, top_nodes),
                   "TOPK Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // build a subgraph contains only top k nodes on cpu
+    // build the vertex-induced subgraph containing only the top k nodes
     build_topk_subgraph<VertexId, Value, SizeT>(
         graph_out,
         graph_original,
@@ -223,62 +202,54 @@ void run_topk(
         (int*)node_ids,
         top_nodes);
 
-    // cleanup if neccessary
-    if (topk_problem) { delete topk_problem; }
-
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out  GunrockGraph type output
+ * @param[out] graph_o     GRGraph type output
  * @param[out] node_ids    output top k node ids
  * @param[out] in_degrees  output top k in-degree centralities
  * @param[out] out_degrees output top k out-degree centralities
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  topk_config topk specific configurations
- * @param[in]  data_type   topk data_type configurations
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      topk specific configurations
+ * @param[in]  data_t      topk data_t configurations
  */
 void dispatch_topk(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *in_degrees,
-    void                  *out_degrees,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   topk_config,
-    const GunrockDataType data_type) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *in_degrees,
+    void          *out_degrees,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
-                // original graph
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 Csr<int, int, int> graph_original(false);
-                graph_original.nodes = ggraph_in->num_nodes;
-                graph_original.edges = ggraph_in->num_edges;
-                graph_original.row_offsets    = (int*)ggraph_in->row_offsets;
-                graph_original.column_indices = (int*)ggraph_in->col_indices;
-
-                // reversed graph
+                graph_original.nodes = graph_i->num_nodes;
+                graph_original.edges = graph_i->num_edges;
+                graph_original.row_offsets    = (int*)graph_i->row_offsets;
+                graph_original.column_indices = (int*)graph_i->col_indices;
                 Csr<int, int, int> graph_reversed(false);
-                graph_reversed.nodes = ggraph_in->num_nodes;
-                graph_reversed.edges = ggraph_in->num_edges;
-                graph_reversed.row_offsets    = (int*)ggraph_in->col_offsets;
-                graph_reversed.column_indices = (int*)ggraph_in->row_indices;
-
-                //graph_original.DisplayGraph();
+                graph_reversed.nodes = graph_i->num_nodes;
+                graph_reversed.edges = graph_i->num_edges;
+                graph_reversed.row_offsets    = (int*)graph_i->col_offsets;
+                graph_reversed.column_indices = (int*)graph_i->row_indices;
 
                 run_topk<int, int, int>(
-                    ggraph_out,
+                    graph_o,
                     (int*)node_ids,
                     (int*)in_degrees,
                     (int*)out_degrees,
                     graph_original,
                     graph_reversed,
-                    topk_config.top_nodes);
+                    config.top_nodes);
 
                 // reset for free memory
                 graph_original.row_offsets    = NULL;
@@ -287,13 +258,11 @@ void dispatch_topk(
                 graph_reversed.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
@@ -309,32 +278,24 @@ void dispatch_topk(
 /*
  * @brief topk dispatch function base on gunrock data types
  *
- * @param[out] ggraph_out  output subgraph of topk problem
+ * @param[out] graph_o     output subgraph of topk problem
  * @param[out] node_ids    output top k node_ids
  * @param[out] in_degrees  output associated centrality values
  * @param[out] out_degrees output associated centrality values
- * @param[in]  ggraph_in   input graph need to process on
- * @param[in]  topk_config gunrock primitive specific configurations
- * @param[in]  data_type   gunrock datatype struct
+ * @param[in]  graph_i     input graph to process
+ * @param[in]  config      gunrock primitive specific configurations
+ * @param[in]  data_t      gunrock data_t struct
  */
-void gunrock_topk_func(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *in_degrees,
-    void                  *out_degrees,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   topk_config,
-    const GunrockDataType data_type) {
-
-    // launch topk dispatch function
-    dispatch_topk(
-        ggraph_out,
-        node_ids,
-        in_degrees,
-        out_degrees,
-        ggraph_in,
-        topk_config,
-        data_type);
+void gunrock_topk(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *in_degrees,
+    void          *out_degrees,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    dispatch_topk(graph_o, node_ids, in_degrees, out_degrees,
+                  graph_i, config, data_t);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index 96f1ddf41..441d77a85 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -21,136 +21,136 @@
 /**
  * @brief VertexId data type enumerators.
  */
-enum VertexIdType {
-    VTXID_INT, //!< integer type
+enum VtxIdType {
+    VTXID_INT,  // integer type
 };
 
 /**
  * @brief SizeT data type enumerators.
  */
 enum SizeTType {
-    SIZET_INT, //!< unsigned integer type
+    SIZET_INT,  // integer type
 };
 
 /**
  * @brief Value data type enumerators.
  */
 enum ValueType {
-    VALUE_INT,   //!< integer type
-    VALUE_UINT,  //!< unsigned int type
-    VALUE_FLOAT, //!< float type
+    VALUE_INT,    // integer type
+    VALUE_UINT,   // unsigned int type
+    VALUE_FLOAT,  // float type
 };
 
 /**
  * @brief data-type configuration used to specify data types
  */
-struct GunrockDataType {
-    enum VertexIdType VTXID_TYPE; //!< VertexId data-type
-    enum SizeTType    SIZET_TYPE; //!< SizeT    data-type
-    enum ValueType    VALUE_TYPE; //!< Value    data-type
+struct GRTypes {
+    enum VtxIdType VTXID_TYPE;  // VertexId data type
+    enum SizeTType SIZET_TYPE;  // SizeT data type
+    enum ValueType VALUE_TYPE;  // Value data type
 };
 
 /**
  * @brief GunrockGraph as a standard graph interface
  */
-struct GunrockGraph {
-    size_t num_nodes;    //!< number of nodes in graph
-    size_t num_edges;    //!< number of edges in graph
-    void   *row_offsets; //!< C.S.R. row offsets
-    void   *col_indices; //!< C.S.R. column indices
-    void   *col_offsets; //!< C.S.C. column offsets
-    void   *row_indices; //!< C.S.C. row indices
-    void   *node_values; //!< associated values per node
-    void   *edge_values; //!< associated values per edge
+struct GRGraph {
+    size_t num_nodes;     // number of nodes in graph
+    size_t num_edges;     // number of edges in graph
+    void   *row_offsets;  // CSR row offsets
+    void   *col_indices;  // CSR column indices
+    void   *col_offsets;  // CSC column offsets
+    void   *row_indices;  // CSC row indices
+    void   *node_values;  // associated values per node
+    void   *edge_values;  // associated values per edge
 };
 
 /**
  * @brief Source Vertex Mode enumerators.
  */
 enum SrcMode {
-    manually,       //!< manually set up source node
-    randomize,      //!< random generate source node
-    largest_degree, //!< set to largest-degree node
+    manually,        // manually set up source node
+    randomize,       // random generate source node
+    largest_degree,  // set to largest-degree node
 };
 
 /**
  * @brief arguments configuration used to specify arguments
  */
-struct GunrockConfig {
-    bool  mark_pred;        //!< whether to mark predecessor or not
-    bool  idempotence;      //!< whether or not to enable idempotent
-    int   src_node;         //!< source vertex define where to start
-    int   device;           //!< setting which gpu device to use
-    int   max_iter;         //!< maximum number of iterations allowed
-    int   top_nodes;        //!< k value for topk / page_rank problem
-    int   delta_factor;     //!< sssp delta-factor parameter
-    float delta;            //!< pagerank specific value
-    float error;            //!< pagerank specific value
-    float queue_size;       //!< setting frontier queue size
-    enum  SrcMode src_mode; //!< source mode rand/largest_degree
+struct GRSetup {
+    bool  mark_pred;         // whether to mark predecessor or not
+    bool  idempotence;       // whether or not to enable idempotent
+    int   src_node;          // source vertex define where to start
+    int   device;            // setting which device to use
+    int   max_iter;          // maximum number of iterations allowed
+    int   top_nodes;         // k value for top k / pagerank problem
+    int   delta_factor;      // sssp delta-factor parameter
+    float delta;             // pagerank specific value
+    float error;             // pagerank specific value
+    float queue_size;        // setting frontier queue size
+    enum  SrcMode src_mode;  // source mode rand/largest_degree
 };
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// BFS Function Define
-void gunrock_bfs_func(
-    struct GunrockGraph       *graph_out,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// BC Function Define
-void gunrock_bc_func(
-    struct GunrockGraph       *graph_out,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// CC Function Define
-void gunrock_cc_func(
-    struct GunrockGraph       *graph_out,
-    unsigned int              *components,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// SSSP Function Define
-void gunrock_sssp_func(
-    struct GunrockGraph       *graph_out,
-    void                      *predecessor,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      congis,
-    struct GunrockDataType    data_type);
-
-// PR Function Define
-void gunrock_pr_func(
-    struct GunrockGraph       *graph_out,
-    void                      *node_ids,
-    void                      *page_rank,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// TopK Function Define
-void gunrock_topk_func(
-    struct GunrockGraph       *graph_out,
-    void                      *node_ids,
-    void                      *in_degrees,
-    void                      *out_degrees,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// Minimum spanning tree
+// breadth-first search
+void gunrock_bfs(
+    struct       GRGraph *graph_o,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// betweenness centrality
+void gunrock_bc(
+    struct       GRGraph *graph_o,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// connected component
+void gunrock_cc(
+    struct       GRGraph *graph_o,
+    unsigned int         *components,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// single-source shortest path
+void gunrock_sssp(
+    struct       GRGraph *graph_o,
+    void                 *predecessor,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// page-rank
+void gunrock_pagerank(
+    struct       GRGraph *graph_o,
+    void                 *node_ids,
+    void                 *pagerank,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// degree centrality
+void gunrock_topk(
+    struct       GRGraph *graph_o,
+    void                 *node_ids,
+    void                 *in_degrees,
+    void                 *out_degrees,
+    const struct GRGraph *graph_i,
+    struct       GRSetup  config,
+    struct       GRTypes  data_t);
+
+// minimum spanning tree
 void gunrock_mst(
-    struct GunrockGraph       *graph_out,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
+    struct       GRGraph *graph_o,
+    const struct GRGraph *graph_i,
+    struct       GRSetup config,
+    struct       GRTypes data_t);
 
-// TODO: Add other algorithms
+// TODO(ydwu): Add other primitives
 
 #ifdef __cplusplus
 }
diff --git a/shared_lib_tests/test_bc.c b/shared_lib_tests/test_bc.c
index 0eb4fdf0f..177585a58 100644
--- a/shared_lib_tests/test_bc.c
+++ b/shared_lib_tests/test_bc.c
@@ -1,76 +1,65 @@
 /**
  * @brief BC test for shared library
  * @file test_bc.c
- *
- * set input graph, configs and call function gunrock_bc_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_FLOAT;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_FLOAT;
 
-  // bc configurations (optional)
-  struct GunrockConfig bc_config;
-  bc_config.device       =    0;
-  bc_config.src_node     =   -1;     //!< source vertex to begin search
-  bc_config.queue_size   = 1.0f;
-  bc_config.src_mode = manually;
+    // bc configurations (optional)
+    struct GRSetup config;
+    config.device     =    0;
+    config.src_node   =   -1;  // source vertex to begin search
+    config.queue_size = 1.0f;
+    config.src_mode   = manually;
 
-  // define graph (undirected graph)
-  size_t num_nodes = 7;
-  size_t num_edges = 26;
-  int row_offsets[8] = {0, 3, 6, 11, 15, 19, 23, 26};
-  int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
-                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    // define graph (undirected graph)
+    size_t num_nodes    = 7;
+    size_t num_edges    = 26;
+    int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                           5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
 
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
 
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
 
-  // run bc calculations
-  gunrock_bc_func(
-    graph_output,
-    graph_input,
-    bc_config,
-    data_type);
+    // run bc calculations
+    gunrock_bc(graph_o, graph_i, config, data_t);
 
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  // print per node betweeness centrality values
-  float *bc_vals = (float*)malloc(sizeof(float) * graph_input->num_nodes);
-  bc_vals = (float*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]);
-  }
-  printf("\n");
-  // print per edge betweeness centrality values
-  float *ebc_vals = (float*)malloc(sizeof(float)*graph_input->num_edges);
-  ebc_vals = (float*)graph_output->edge_values;
-  for (i = 0; i < graph_input->num_edges; ++i)
-  {
-    printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]);
-  }
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    // print per node betweenness centrality values
+    // (read straight from graph_o->node_values; a malloc here would leak)
+    float *bc_vals = (float*)graph_o->node_values;
+    for (i = 0; i < graph_i->num_nodes; ++i) {
+        printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]);
+    }
+    printf("\n");
+    // print per edge betweenness centrality values
+    // (read straight from graph_o->edge_values; a malloc here would leak)
+    float *ebc_vals = (float*)graph_o->edge_values;
+    for (i = 0; i < graph_i->num_edges; ++i) {
+        printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]);
+    }
 
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
 
-  return 0;
+    return 0;
 }
diff --git a/shared_lib_tests/test_bfs.c b/shared_lib_tests/test_bfs.c
index d3f57b747..11b43b2a5 100644
--- a/shared_lib_tests/test_bfs.c
+++ b/shared_lib_tests/test_bfs.c
@@ -1,69 +1,59 @@
 /**
  * @brief BFS test for shared library
  * @file test_bfs.c
- *
- * set input graph, configs and call function gunrock_bfs_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
-
-  // bfs configurations (optional)
-  struct GunrockConfig bfs_config;
-  bfs_config.device      = 0;
-  bfs_config.src_mode    = randomize;
-  bfs_config.src_node    = 1;     //!< source vertex to begin search
-  bfs_config.mark_pred   = false; //!< do not mark predecessors
-  bfs_config.idempotence = false; //!< wether enable idempotence
-  bfs_config.queue_size  = 1.0f;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-
-  // run bfs calculations
-  gunrock_bfs_func(
-    graph_output,
-    graph_input,
-    bfs_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  int *labels = (int*)malloc(sizeof(int) * graph_input->num_nodes);
-  labels = (int*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : Label [%d]\n", i, labels[i]);
-  }
-
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-  if (labels)       { free(labels);       }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    // bfs configurations (optional)
+    struct GRSetup config;
+    config.device      = 0;
+    config.src_mode    = randomize;
+    config.src_node    = 1;      // source vertex to begin search
+    config.mark_pred   = false;  // do not mark predecessors
+    config.idempotence = false;  // whether to enable idempotence
+    config.queue_size  = 1.0f;
+
+    // define graph
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    // run bfs calculations
+    gunrock_bfs(graph_o, graph_i, config, data_t);
+
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    // labels aliases the result buffer in graph_o->node_values (freed below)
+    int *labels = (int*)graph_o->node_values;
+    for (i = 0; i < graph_i->num_nodes; ++i) {
+        printf("Node_ID [%d] : Label [%d]\n", i, labels[i]);
+    }
+
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+    if (labels)  { free(labels);  }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_cc.c b/shared_lib_tests/test_cc.c
index a230619b9..0dbd67bc1 100644
--- a/shared_lib_tests/test_cc.c
+++ b/shared_lib_tests/test_cc.c
@@ -1,66 +1,55 @@
 /**
  * @brief CC test for shared library
  * @file test_cc.c
- *
- * set input graph, configs and call function gunrock_cc_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
-
-  // connected component configurations
-  struct GunrockConfig configs;
-  configs.device = 0;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
-
-  // run connected component calculations
-  gunrock_cc_func(
-    graph_output,
-    components,
-    graph_input,
-    configs,
-    data_type);
-
-  // test print
-  int i;
-  printf("Number of Components: %d\n", components[0]);
-  printf("Demo Outputs:\n");
-  int *component_ids = (int*)malloc(sizeof(int) * graph_input->num_nodes);
-  component_ids = (int*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : Component_ID [%d]\n", i, component_ids[i]);
-  }
-
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    // connected component configurations
+    struct GRSetup config;
+    config.device = 0;
+
+    // define graph
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
+
+    // run connected component calculations
+    gunrock_cc(graph_o, components, graph_i, config, data_t);
+
+    // demo test print
+    printf("Number of Components: %u\n", components[0]);
+    printf("Demo Outputs:\n");
+    int *component_ids = (int*)graph_o->node_values;  // alias, no malloc
+    int node;
+    for (node = 0; node < graph_i->num_nodes; ++node) {
+        printf("Node_ID [%d] : Component_ID [%d]\n", node, component_ids[node]);
+    }
+
+    // clean up
+    if (components) { free(components); }
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_mst.c b/shared_lib_tests/test_mst.c
index 47592a206..07fbdb11c 100644
--- a/shared_lib_tests/test_mst.c
+++ b/shared_lib_tests/test_mst.c
@@ -1,62 +1,57 @@
 /**
  * @brief MST test for shared library
  * @file test_mst.c
- *
- * set input graph, configs and call function gunrock_mst
- * return per node or per edge values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // set problem data types
-  struct GunrockDataType dt;
-  dt.VTXID_TYPE = VTXID_INT;
-  dt.SIZET_TYPE = SIZET_INT;
-  dt.VALUE_TYPE = VALUE_INT;
-
-  // configurations (optional)
-  struct GunrockConfig configs;
-  configs.device = 0;
-
-  // tiny sample graph
-  size_t num_nodes = 7;
-  size_t num_edges = 26;
-  int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
-  int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
-                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
-  int edge_values[26] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-  graph_input->edge_values = (void*)&edge_values[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-
-  // call MST
-  gunrock_mst(graph_output, graph_input, configs, dt);
-
-  // demo test print
-  printf("Demo Outputs:\n");
-  int *mst_mask = (int*)malloc(sizeof(int) * num_edges);
-  mst_mask = (int*)graph_output->edge_values;
-  int edge;
-  for (edge = 0; edge < num_edges; ++edge) {
-    printf("Edge ID [%d] : Label [%d]\n", edge, mst_mask[edge]);
-  }
-
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // set problem data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+
+    // configurations (optional)
+    struct GRSetup config;
+    config.device = 0;
+
+    // tiny sample graph
+    size_t num_nodes = 7;
+    size_t num_edges = 26;
+    int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                           5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    int edge_values[26] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    // build a graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    // call minimum spanning tree
+    gunrock_mst(graph_o, graph_i, config, data_t);
+
+    // demo test print
+    printf("Demo Outputs:\n");
+    // mst_mask aliases graph_o->edge_values; a malloc here would only leak
+    int *mst_mask = (int*)graph_o->edge_values;
+    int edge;
+    for (edge = 0; edge < num_edges; ++edge) {
+        printf("Edge ID [%d] : Mask [%d]\n", edge, mst_mask[edge]);
+    }
+
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_pr.c b/shared_lib_tests/test_pr.c
index 0b8ceae0c..cb36b4df1 100644
--- a/shared_lib_tests/test_pr.c
+++ b/shared_lib_tests/test_pr.c
@@ -1,74 +1,62 @@
 /**
  * @brief PR test for shared library
  * @file test_pr.c
- *
- * set input graph, configs and call function gunrock_pr_func
- * return per node or per edge values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;   //!< integer type vertex_ids
-  data_type.SIZET_TYPE = SIZET_INT;   //!< integer type graph size
-  data_type.VALUE_TYPE = VALUE_FLOAT; //!< float type value for pr
-
-  // pr configurations (optional)
-  struct GunrockConfig pr_config;
-  pr_config.device    =     0; //!< use device 0
-  pr_config.delta     = 0.85f; //!< default delta value
-  pr_config.error     = 0.01f; //!< default error threshold
-  pr_config.max_iter  =    20; //!< maximum number of iterations
-  pr_config.top_nodes =    10; //!< number of top nodes
-  pr_config.src_node  =     0; //!< source node to begin page rank
-  pr_config.src_mode  = manually; //!< set source node manually
-
-  // define graph (undirected graph)
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int   *node_ids  = (int*)malloc(sizeof(int) * pr_config.top_nodes);
-  float *page_rank = (float*)malloc(sizeof(float) * pr_config.top_nodes);
-
-  // run pr calculations
-  gunrock_pr_func(
-    graph_output,
-    node_ids,
-    page_rank,
-    graph_input,
-    pr_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  if (pr_config.top_nodes > num_nodes) pr_config.top_nodes = num_nodes;
-  for (i = 0; i < pr_config.top_nodes; ++i)
-  {
-    printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], page_rank[i]);
-  }
-
-  if (node_ids)     { free(node_ids);     }
-  if (page_rank)    { free(page_rank);    }
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;    // integer type vertex_ids
+    data_t.SIZET_TYPE = SIZET_INT;    // integer type graph size
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float type value for pr
+
+    // pr configurations (optional)
+    struct GRSetup config;
+    config.device    =     0;  // use device 0
+    config.delta     = 0.85f;  // default delta value
+    config.error     = 0.01f;  // default error threshold
+    config.max_iter  =    20;  // maximum number of iterations
+    config.top_nodes =    10;  // number of top nodes
+    config.src_node  =     0;  // source node to begin page rank
+    config.src_mode  = manually;  // set source node manually
+
+    // define graph (undirected graph)
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int   *node_ids  = (int*)malloc(sizeof(int) * config.top_nodes);
+    float *pagerank = (float*)malloc(sizeof(float) * config.top_nodes);
+
+    // run pr calculations
+    gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, data_t);
+
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    if (config.top_nodes > num_nodes) config.top_nodes = num_nodes;
+    for (i = 0; i < config.top_nodes; ++i) {
+        printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], pagerank[i]);
+    }
+
+    // clean up
+    if (node_ids) { free(node_ids); }
+    if (pagerank) { free(pagerank); }
+    if (graph_i)  { free(graph_i);  }
+    if (graph_o)  { free(graph_o);  }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_sssp.c b/shared_lib_tests/test_sssp.c
index f4fc0fe5b..e22370a3d 100644
--- a/shared_lib_tests/test_sssp.c
+++ b/shared_lib_tests/test_sssp.c
@@ -1,75 +1,63 @@
 /**
  * @brief SSSP test for shared library
  * @file test_sssp.c
- *
- * set input graph, configs and call function gunrock_sssp_func
- * return per node or per edge values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_UINT;
-
-  // pr configurations (optional)
-  struct GunrockConfig sssp_config;
-  sssp_config.device       =    0;
-  sssp_config.mark_pred    = true;
-  sssp_config.queue_size   = 1.0f;
-  sssp_config.delta_factor =    1;
-  sssp_config.src_mode     = randomize;
-  //sssp_config.src_node     =    1;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-
-  int row_offsets[8]           = {0,3,6,9,11,14,15,15};
-  int col_indices[15]          = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-  unsigned int edge_values[15] = {39,6,41,51,63,17,10,44,41,13,58,43,50,59,35};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-  graph_input->edge_values = (void*)&edge_values[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int *predecessor = (int*)malloc(sizeof(int) * num_nodes);
-
-  // run sssp calculations
-  gunrock_sssp_func(
-    graph_output,
-    predecessor,
-    graph_input,
-    sssp_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  int *label = (int*)malloc(sizeof(int) * num_nodes);
-  label = (int*)graph_output->node_values;
-  for (i = 0; i < num_nodes; ++i)
-  {
-    printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n",
-           i, label[i], predecessor[i]);
-  }
-
-  if (predecessor)  { free(predecessor);  }
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_UINT;
+
+    // configurations (optional)
+    struct GRSetup config;
+    config.device       =    0;
+    config.mark_pred    = true;
+    config.queue_size   = 1.0f;
+    config.delta_factor =    1;
+    config.src_mode     = randomize;
+
+    // define graph
+    size_t num_nodes = 7;
+    size_t num_edges = 15;
+
+    int row_offsets[8]           = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15]          = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+    unsigned int edge_values[15] = {39, 6, 41, 51, 63, 17, 10, 44, 41, 13, 58, 43, 50, 59, 35};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int *predecessor = (int*)malloc(sizeof(int) * num_nodes);
+
+    // run calculations
+    gunrock_sssp(graph_o, predecessor, graph_i, config, data_t);
+
+    // demo test print
+    printf("Demo Outputs:\n");
+    // label aliases graph_o->node_values; a malloc here would only leak
+    int *label = (int*)graph_o->node_values;
+    int node;
+    for (node = 0; node < num_nodes; ++node) {
+        printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n",
+               node, label[node], predecessor[node]);
+    }
+
+    // clean up
+    if (predecessor) { free(predecessor); }
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_topk.c b/shared_lib_tests/test_topk.c
index 1feea5e97..416fe9f7c 100644
--- a/shared_lib_tests/test_topk.c
+++ b/shared_lib_tests/test_topk.c
@@ -1,68 +1,65 @@
+/**
+ * @brief Top K test for shared library
+ * @file test_topk.c
+ */
+
 #include <gunrock/gunrock.h>
 #include <stdio.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    struct GRSetup config;
+    config.device    = 0;
+    config.top_nodes = 3;
 
-  struct GunrockConfig topk_config;
-  topk_config.device    = 0;
-  topk_config.top_nodes = 3;
+    // define graph (directed, reversed and non-reversed)
+    size_t num_nodes = 7;
+    size_t num_edges = 15;
 
-  // define graph (directed, reversed and non-reversed)
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
 
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
+    int col_offsets[8]  = {0, 1, 2, 5, 7, 9, 12, 15};
+    int row_indices[15] = {1, 0, 0, 1, 4, 0, 2, 1, 2, 2, 3, 4, 3, 4, 5};
 
-  int col_offsets[8] = {0,1,2,5,7,9,12,15};
-  int row_indices[15] = {1,0,0,1,4,0,2,1,2,2,3,4,3,4,5};
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes = num_nodes;
+    graph_i->num_edges = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->col_offsets = (void*)&col_offsets[0];
+    graph_i->row_indices = (void*)&row_indices[0];
 
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes = num_nodes;
-  graph_input->num_edges = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-  graph_input->col_offsets = (void*)&col_offsets[0];
-  graph_input->row_indices = (void*)&row_indices[0];
+    // malloc output result arrays
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int *node_ids    = (int*)malloc(sizeof(int) * config.top_nodes);
+    int *in_degrees  = (int*)malloc(sizeof(int) * config.top_nodes);
+    int *out_degrees = (int*)malloc(sizeof(int) * config.top_nodes);
 
-  // malloc output result arrays
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int *node_ids    = (int*)malloc(sizeof(int) * topk_config.top_nodes);
-  int *in_degrees  = (int*)malloc(sizeof(int) * topk_config.top_nodes);
-  int *out_degrees = (int*)malloc(sizeof(int) * topk_config.top_nodes);
+    // run topk calculations
+    gunrock_topk(
+        graph_o, node_ids, in_degrees, out_degrees, graph_i, config, data_t);
 
-  // run topk calculations
-  gunrock_topk_func(
-    graph_output,
-    node_ids,
-    in_degrees,
-    out_degrees,
-    graph_input,
-    topk_config,
-    data_type);
+    // print results for check correctness
+    printf("Demo Outputs:\n");
+    int node;
+    for (node = 0; node < config.top_nodes; ++node) {
+        printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n",
+               node_ids[node], in_degrees[node], out_degrees[node]);
+    }
 
-  // print results for check correctness
-  int i;
-  printf("Demo Outputs:\n");
-  for (i = 0; i < topk_config.top_nodes; ++i)
-  {
-    printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n",
-      node_ids[i], in_degrees[i], out_degrees[i]);
-  }
+    // clean up
+    if (in_degrees)  free(in_degrees);
+    if (out_degrees) free(out_degrees);
+    if (node_ids)    free(node_ids);
+    if (graph_i)     free(graph_i);
+    if (graph_o)     free(graph_o);
 
-  if (in_degrees)   free(in_degrees);
-  if (out_degrees)  free(out_degrees);
-  if (node_ids)     free(node_ids);
-  if (graph_input)  free(graph_input);
-  if (graph_output) free(graph_output);
-  return 0;
+    return 0;
 }
\ No newline at end of file
diff --git a/tests/hits/CMakeLists.txt b/tests/hits/CMakeLists.txt
index 2ba54cb95..ef9e22ff5 100644
--- a/tests/hits/CMakeLists.txt
+++ b/tests/hits/CMakeLists.txt
@@ -12,7 +12,7 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
-CUDA_ADD_EXECUTABLE(hyperlink_induced_topic_search
+CUDA_ADD_EXECUTABLE(HITS
   test_hits.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/error_utils.cu
diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index b563517ac..7c9eb5768 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -140,7 +140,7 @@ void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph, int *mst_output)
 template<typename VertexId, typename Value, typename SizeT>
 bool IsConnected(const Csr<VertexId, Value, SizeT> & graph)
 {
-  GunrockGraph *temp = (GunrockGraph*)malloc(sizeof(GunrockGraph));
+  GRGraph *temp = (GRGraph*)malloc(sizeof(GRGraph));
   unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
   run_cc<VertexId, Value, SizeT>(temp, components, graph, 0, 1);
   if (temp) free(temp);

From e75447fe76517d827ae234ec9c17689b2e43cd16 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 24 Jun 2015 11:25:43 -0700
Subject: [PATCH 27/36] a new simpler interface takes in csr

---
 gunrock/app/bfs/bfs_app.cu               | 59 ++++++++++++++++++++----
 gunrock/gunrock.h                        |  9 ++++
 shared_lib_tests/simple_interface_test.c | 27 +++++++++++
 3 files changed, 87 insertions(+), 8 deletions(-)
 create mode 100644 shared_lib_tests/simple_interface_test.c

diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index 1fe0300b5..d02a51551 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -46,23 +46,23 @@ using namespace gunrock::app::bfs;
  * @param[in] context Reference to CudaContext used by moderngpu functions
  *
  */
-template<typename VertexId, typename Value, typename SizeT, 
-         bool MARK_PREDECESSORS, bool ENABLE_IDEMPOTENCE >
+template<typename VertexId, typename Value, typename SizeT,
+         bool MARK_PREDECESSORS, bool ENABLE_IDEMPOTENCE>
 void run_bfs(
     GRGraph      *graph_o,
-    const  Csr<VertexId, Value, SizeT> &csr,
-    const        VertexId src,
+    const Csr<VertexId, Value, SizeT> &csr,
+    const VertexId src,
     const int    max_grid_size,
     const int    num_gpus,
     const double max_queue_sizing,
     CudaContext  &context) {
-    typedef BFSProblem<VertexId, SizeT, Value, MARK_PREDECESSORS, 
+    typedef BFSProblem<VertexId, SizeT, Value, MARK_PREDECESSORS,
         ENABLE_IDEMPOTENCE, (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE)> Problem;
     // Allocate host-side label array for gpu-computed results
     VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     VertexId *h_preds = NULL;
     if (MARK_PREDECESSORS) {
-        //h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
+        // h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     }
 
     BFSEnactor<false> enactor(false);  // Allocate BFS enactor map
@@ -75,17 +75,23 @@ void run_bfs(
                       src, enactor.GetFrontierType(), max_queue_sizing),
                   "BFS Problem Data Reset Failed", __FILE__, __LINE__);
 
+    GpuTimer gpu_timer;
+    float elapsed = 0.0f;
+    gpu_timer.Start();
     util::GRError(enactor.template Enact<Problem>(
                       context, problem, src, max_grid_size),
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
+    gpu_timer.Stop();
+    elapsed = gpu_timer.ElapsedMillis();
 
     util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    graph_o->node_values = (int*)&h_labels[0];  // label per node to GRGraph struct
+    graph_o->node_values = (int*)&h_labels[0];  // label per node to graph_o
+    printf(" elapsed time: %.4f ms\n", elapsed);
 
     if (problem) delete problem;
-    //if (h_preds)     free(h_preds);
+    // if (h_preds)     free(h_preds);
     cudaDeviceSynchronize();
 }
 
@@ -233,6 +239,43 @@ void gunrock_bfs(
     dispatch_bfs(graph_o, graph_i, config, data_t, *context);
 }
 
+/*
+ * @brief bfs interface take in CSR arrays as input
+ */
+void bfs(
+    int       *bfs_label,
+    const int  num_nodes,
+    const int  num_edges,
+    const int *row_offsets,
+    const int *col_indices,
+    const int  source,
+    const int  device) {
+    printf("-------------------- setting --------------------\n");
+    struct GRTypes data_t;  // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;  // integer
+    data_t.SIZET_TYPE = SIZET_INT;  // integer
+    data_t.VALUE_TYPE = VALUE_INT;  // integer
+    struct GRSetup config;  // primitive-specific configures
+    config.device      = device;  // setting device to run
+    config.src_node    = source;  // source vertex to begin
+    config.mark_pred   =  false;  // do not mark predecessors
+    config.idempotence =  false;  // wether enable idempotence
+    config.queue_size  =   1.0f;  // maximum queue size factor
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    printf(" loaded num nodes: %d, num edges: %d\n", num_nodes, num_edges);
+    printf("-------------------- running --------------------\n");
+    gunrock_bfs(graph_o, graph_i, config, data_t);
+    memcpy(bfs_label, (int*)graph_o->node_values, num_nodes * sizeof(int));
+    printf("-------------------- cleanup --------------------\n");
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+}
+
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index 441d77a85..87caf4a19 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -101,6 +101,15 @@ void gunrock_bfs(
     struct       GRSetup  config,
     struct       GRTypes  data_t);
 
+void bfs(
+    int       *bfs_label,
+    const int  num_nodes,
+    const int  num_edges,
+    const int *row,
+    const int *col,
+    const int  src,
+    const int  dev);
+
 // betweenness centrality
 void gunrock_bc(
     struct       GRGraph *graph_o,
diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c
new file mode 100644
index 000000000..d9d8b3ae0
--- /dev/null
+++ b/shared_lib_tests/simple_interface_test.c
@@ -0,0 +1,27 @@
+/**
+ * @brief Simple test for shared library simple interface
+ * @file simple_interface_test.c
+ */
+
+#include <stdio.h>
+#include <gunrock/gunrock.h>
+
+int main(int argc, char* argv[]) {
+    int row_offsets[] = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+    size_t num_nodes = sizeof(row_offsets) / sizeof(row_offsets[0]) - 1;
+    size_t num_edges = sizeof(col_indices) / sizeof(col_indices[0]);
+
+    int *labels = (int*)malloc(sizeof(int) * num_nodes);
+
+    // test simple breath-first search interface
+    bfs(labels, num_nodes, num_edges, row_offsets, col_indices, 0, 0);
+    printf("-------------------- outputs --------------------\n");
+    int node; for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | label (depth): [%d]\n", node, labels[node]);
+    }
+    printf("------------------- completed -------------------\n");
+
+    if (labels) { free(labels); }
+    return 0;
+}

From 16248352adbfa7e0b14e3d79855fb4e9389d04b0 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 24 Jun 2015 11:26:09 -0700
Subject: [PATCH 28/36] build simple_interface_test in shared_lib_tests CMakeLists

---
 shared_lib_tests/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/shared_lib_tests/CMakeLists.txt b/shared_lib_tests/CMakeLists.txt
index 96d2ee37a..7d880fae6 100644
--- a/shared_lib_tests/CMakeLists.txt
+++ b/shared_lib_tests/CMakeLists.txt
@@ -1,6 +1,9 @@
 # gunrock test rig cmake file
 # include_directories(${gunrock_INCLUDE_DIRS}/gunrock)
 
+add_executable(simple_interface_test simple_interface_test.c)
+target_link_libraries(simple_interface_test gunrock)
+
 add_executable(test_topk test_topk.c)
 target_link_libraries(test_topk gunrock)
 

From ac1d96360b9f414bad1a213f94836aa8df887d91 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Wed, 24 Jun 2015 20:22:15 -0700
Subject: [PATCH 29/36] drop max_grid_size from run_bfs and tidy gunrock.h declarations

---
 gunrock/app/bfs/bfs_app.cu               |  22 ++--
 gunrock/gunrock.h                        | 124 ++++++++++++-----------
 shared_lib_tests/simple_interface_test.c |   2 +-
 3 files changed, 72 insertions(+), 76 deletions(-)

diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index d02a51551..627e52554 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -52,7 +52,6 @@ void run_bfs(
     GRGraph      *graph_o,
     const Csr<VertexId, Value, SizeT> &csr,
     const VertexId src,
-    const int    max_grid_size,
     const int    num_gpus,
     const double max_queue_sizing,
     CudaContext  &context) {
@@ -75,14 +74,12 @@ void run_bfs(
                       src, enactor.GetFrontierType(), max_queue_sizing),
                   "BFS Problem Data Reset Failed", __FILE__, __LINE__);
 
-    GpuTimer gpu_timer;
-    float elapsed = 0.0f;
-    gpu_timer.Start();
-    util::GRError(enactor.template Enact<Problem>(
-                      context, problem, src, max_grid_size),
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+
+    util::GRError(enactor.template Enact<Problem>(context, problem, src),
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    elapsed = gpu_timer.ElapsedMillis();
+
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
 
     util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
@@ -90,8 +87,8 @@ void run_bfs(
     graph_o->node_values = (int*)&h_labels[0];  // label per node to graph_o
     printf(" elapsed time: %.4f ms\n", elapsed);
 
-    if (problem) delete problem;
-    // if (h_preds)     free(h_preds);
+    if (problem) { delete problem; }
+    if (h_preds) {  free(h_preds); }
     cudaDeviceSynchronize();
 }
 
@@ -126,7 +123,6 @@ void dispatch_bfs(
                 // default configurations
                 int   src_node      = 0;  // default source vertex to start
                 int   num_gpus      = 1;  // number of GPUs for multi-gpu
-                int   max_grid_size = 0;  // leave it up to the enactor
                 bool  mark_pred     = 0;  // whether to mark predecessor or not
                 bool  idempotence   = 0;  // whether or not enable idempotence
                 float max_queue_sizing = 1.0f;  // maximum size scaling factor
@@ -161,7 +157,6 @@ void dispatch_bfs(
                             graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
@@ -170,7 +165,6 @@ void dispatch_bfs(
                             graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
@@ -181,7 +175,6 @@ void dispatch_bfs(
                             graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
@@ -190,7 +183,6 @@ void dispatch_bfs(
                             graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index 87caf4a19..a50309615 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -12,7 +12,6 @@
  * The Gunrock public interface is a C-only interface to enable linking
  * with code written in other languages. While the internals of Gunrock
  * are not limited to C.
- *
  */
 
 #include <stdlib.h>
@@ -54,14 +53,14 @@ struct GRTypes {
  * @brief GunrockGraph as a standard graph interface
  */
 struct GRGraph {
-    size_t num_nodes;     // number of nodes in graph
-    size_t num_edges;     // number of edges in graph
-    void   *row_offsets;  // CSR row offsets
-    void   *col_indices;  // CSR column indices
-    void   *col_offsets;  // CSC column offsets
-    void   *row_indices;  // CSC row indices
-    void   *node_values;  // associated values per node
-    void   *edge_values;  // associated values per edge
+    size_t  num_nodes;  // number of nodes in graph
+    size_t  num_edges;  // number of edges in graph
+    void *row_offsets;  // CSR row offsets
+    void *col_indices;  // CSR column indices
+    void *col_offsets;  // CSC column offsets
+    void *row_indices;  // CSC row indices
+    void *node_values;  // associated values per node
+    void *edge_values;  // associated values per edge
 };
 
 /**
@@ -77,17 +76,17 @@ enum SrcMode {
  * @brief arguments configuration used to specify arguments
  */
 struct GRSetup {
-    bool  mark_pred;         // whether to mark predecessor or not
-    bool  idempotence;       // whether or not to enable idempotent
-    int   src_node;          // source vertex define where to start
-    int   device;            // setting which device to use
-    int   max_iter;          // maximum number of iterations allowed
-    int   top_nodes;         // k value for top k / pagerank problem
-    int   delta_factor;      // sssp delta-factor parameter
-    float delta;             // pagerank specific value
-    float error;             // pagerank specific value
-    float queue_size;        // setting frontier queue size
-    enum  SrcMode src_mode;  // source mode rand/largest_degree
+    bool        mark_pred;  // whether to mark predecessor or not
+    bool      idempotence;  // whether or not to enable idempotent
+    int          src_node;  // source vertex define where to start
+    int            device;  // setting which device to use
+    int          max_iter;  // maximum number of iterations allowed
+    int         top_nodes;  // k value for top k / pagerank problem
+    int      delta_factor;  // sssp delta-factor parameter
+    float           delta;  // pagerank specific value
+    float           error;  // pagerank specific value
+    float      queue_size;  // setting frontier queue size
+    enum SrcMode src_mode;  // source mode rand/largest_degree
 };
 
 #ifdef __cplusplus
@@ -96,68 +95,73 @@ extern "C" {
 
 // breath-first search
 void gunrock_bfs(
-    struct       GRGraph *graph_o,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
+// simple interface
 void bfs(
-    int       *bfs_label,
+    int*       bfs_label,
     const int  num_nodes,
     const int  num_edges,
-    const int *row,
-    const int *col,
-    const int  src,
-    const int  dev);
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source,
+    const int  device);
 
 // betweenness centrality
 void gunrock_bc(
-    struct       GRGraph *graph_o,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
 // connected component
 void gunrock_cc(
-    struct       GRGraph *graph_o,
-    unsigned int         *components,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct GRGraph*       graph_o,
+    unsigned int*         components,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+/*
+int cc(int *component, const int  num_nodes, const int  num_edges,
+       const int *offsets, const int *indices, const int  device);
+*/
 
 // single-source shortest path
 void gunrock_sssp(
-    struct       GRGraph *graph_o,
-    void                 *predecessor,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct GRGraph*       graph_o,
+    void*                 predecessor,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
 // page-rank
 void gunrock_pagerank(
-    struct       GRGraph *graph_o,
-    void                 *node_ids,
-    void                 *pagerank,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct GRGraph*       graph_o,
+    void*                 node_ids,
+    void*                 pagerank,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
 // degree centrality
 void gunrock_topk(
-    struct       GRGraph *graph_o,
-    void                 *node_ids,
-    void                 *in_degrees,
-    void                 *out_degrees,
-    const struct GRGraph *graph_i,
-    struct       GRSetup  config,
-    struct       GRTypes  data_t);
+    struct  GRGraph*      graph_o,
+    void*                 node_ids,
+    void*                 in_degrees,
+    void*                 out_degrees,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
 // minimum spanning tree
 void gunrock_mst(
-    struct       GRGraph *graph_o,
-    const struct GRGraph *graph_i,
-    struct       GRSetup config,
-    struct       GRTypes data_t);
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
 
 // TODO(ydwu): Add other primitives
 
diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c
index d9d8b3ae0..2d4842e94 100644
--- a/shared_lib_tests/simple_interface_test.c
+++ b/shared_lib_tests/simple_interface_test.c
@@ -14,7 +14,7 @@ int main(int argc, char* argv[]) {
 
     int *labels = (int*)malloc(sizeof(int) * num_nodes);
 
-    // test simple breath-first search interface
+    printf(" testing breath-first search ...\n");  // test bfs
     bfs(labels, num_nodes, num_edges, row_offsets, col_indices, 0, 0);
     printf("-------------------- outputs --------------------\n");
     int node; for (node = 0; node < num_nodes; ++node) {

From fbf72be8cc68ca966966f9fd158f2488fa1dc1c5 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Thu, 25 Jun 2015 08:25:10 -0700
Subject: [PATCH 30/36] added BC, CC simple interface

---
 gunrock/app/bc/bc_app.cu                 | 58 +++++++++++++++++-
 gunrock/app/bfs/bfs_app.cu               | 76 ++++++++++++++----------
 gunrock/app/cc/cc_app.cu                 | 63 ++++++++++++++++++--
 gunrock/gunrock.h                        | 64 ++++++++++++++++----
 shared_lib_tests/simple_interface_test.c | 29 ++++++---
 5 files changed, 230 insertions(+), 60 deletions(-)

diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu
index af413d79b..6d1379b73 100644
--- a/gunrock/app/bc/bc_app.cu
+++ b/gunrock/app/bc/bc_app.cu
@@ -45,13 +45,13 @@ using namespace gunrock::app::bc;
  */
 template<typename VertexId, typename Value, typename SizeT>
 void run_bc(
-    GRGraph        *graph_o,
-    const Csr<VertexId, Value, SizeT> &csr,
+    GRGraph*       graph_o,
+    const Csr<VertexId, Value, SizeT>& csr,
     const VertexId source,
     const int      max_grid_size,
     const int      num_gpus,
     const double   max_queue_sizing,
-    CudaContext    &context) {
+    CudaContext&   context) {
     typedef BCProblem<VertexId, SizeT, Value, true, false > Problem;
     // Allocate host-side array (for both reference and gpu-computed results)
     Value *h_sigmas     = (Value*)malloc(sizeof(Value) * csr.nodes);
@@ -63,6 +63,8 @@ void run_bc(
     util::GRError(problem->Init(false, csr, num_gpus),
                   "BC Problem Initialization Failed", __FILE__, __LINE__);
 
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+
     VertexId start_source;
     VertexId end_source;
     if (source == -1) {
@@ -85,6 +87,9 @@ void run_bc(
     util::MemsetScaleKernel <<< 128, 128>>>(
         problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)csr.nodes);
 
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
     util::GRError(problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
                   "BC Problem Data Extraction Failed", __FILE__, __LINE__);
 
@@ -204,6 +209,53 @@ void gunrock_bc(
     dispatch_bc(graph_o, graph_i, config, data_t, *context);
 }
 
+/*
+ * @brief Simple interface take in CSR arrays as input
+ * @param[out] bc_scores   Return BC node centrality per nodes
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  source      Source to begin traverse
+ */
+void bc(
+    float*     bc_scores,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;            // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;    // integer
+    data_t.SIZET_TYPE = SIZET_INT;    // integer
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float BC scores
+
+    struct GRSetup config;          // primitive-specific configures
+    config.device      =      0;    // setting device to run
+    config.src_node    = source;    // source vertex to begin
+    config.queue_size  =   1.0f;    // maximum queue size factor
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
+    
+    printf("-------------------- running --------------------\n");
+    gunrock_bc(graph_o, graph_i, config, data_t);
+    memcpy(bc_scores, (float*)graph_o->node_values, num_nodes * sizeof(float));
+    
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
+}
+
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index 627e52554..b0421f48b 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -49,15 +49,15 @@ using namespace gunrock::app::bfs;
 template<typename VertexId, typename Value, typename SizeT,
          bool MARK_PREDECESSORS, bool ENABLE_IDEMPOTENCE>
 void run_bfs(
-    GRGraph      *graph_o,
-    const Csr<VertexId, Value, SizeT> &csr,
+    GRGraph*       graph_o,
+    const Csr<VertexId, Value, SizeT>& csr,
     const VertexId src,
-    const int    num_gpus,
-    const double max_queue_sizing,
-    CudaContext  &context) {
+    const int      num_gpus,
+    const double   max_queue_sizing,
+    CudaContext&   context) {
     typedef BFSProblem<VertexId, SizeT, Value, MARK_PREDECESSORS,
         ENABLE_IDEMPOTENCE, (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE)> Problem;
-    // Allocate host-side label array for gpu-computed results
+    // Allocate host-side label array for GPU-computed results
     VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     VertexId *h_preds = NULL;
     if (MARK_PREDECESSORS) {
@@ -80,12 +80,12 @@ void run_bfs(
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
 
     gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
-
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+    
     util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
 
     graph_o->node_values = (int*)&h_labels[0];  // label per node to graph_o
-    printf(" elapsed time: %.4f ms\n", elapsed);
 
     if (problem) { delete problem; }
     if (h_preds) {  free(h_preds); }
@@ -102,11 +102,11 @@ void run_bfs(
  * @param[in]  context  ModernGPU context
  */
 void dispatch_bfs(
-    GRGraph       *graph_o,
-    const GRGraph *graph_i,
+    GRGraph*       graph_o,
+    const GRGraph* graph_i,
     const GRSetup  config,
     const GRTypes  data_t,
-    CudaContext   &context) {
+    CudaContext&   context) {
     switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
         switch (data_t.SIZET_TYPE) {
@@ -122,12 +122,12 @@ void dispatch_bfs(
 
                 // default configurations
                 int   src_node      = 0;  // default source vertex to start
-                int   num_gpus      = 1;  // number of GPUs for multi-gpu
+                int   num_gpus      = 1;  // number of GPUs for multi-GPU
                 bool  mark_pred     = 0;  // whether to mark predecessor or not
-                bool  idempotence   = 0;  // whether or not enable idempotence
+                bool  idempotence   = 0;  // whether or not enable idempotent
                 float max_queue_sizing = 1.0f;  // maximum size scaling factor
 
-                // determine source vertex to start bfs
+                // determine source vertex to start
                 switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
@@ -215,14 +215,14 @@ void dispatch_bfs(
 /*
  * @brief gunrock_bfs function
  *
- * @param[out] graph_o output subgraph of bfs problem
+ * @param[out] graph_o output subgraph of the problem
  * @param[in]  graph_i input graph need to process on
  * @param[in]  config  gunrock primitive specific configurations
  * @param[in]  data_t  gunrock data_t struct
  */
 void gunrock_bfs(
-    GRGraph       *graph_o,
-    const GRGraph *graph_i,
+    GRGraph*       graph_o,
+    const GRGraph* graph_i,
     const GRSetup  config,
     const GRTypes  data_t) {
     unsigned int device = 0;
@@ -232,40 +232,52 @@ void gunrock_bfs(
 }
 
 /*
- * @brief bfs interface take in CSR arrays as input
+ * @brief Simple interface take in CSR arrays as input
+ * @param[out] bfs_label   Return BFS labels per nodes
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  source      Source to begin traverse
  */
 void bfs(
-    int       *bfs_label,
+    int*       bfs_label,
     const int  num_nodes,
     const int  num_edges,
-    const int *row_offsets,
-    const int *col_indices,
-    const int  source,
-    const int  device) {
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source) {
     printf("-------------------- setting --------------------\n");
-    struct GRTypes data_t;  // primitive-specific data types
+
+    struct GRTypes data_t;          // primitive-specific data types
     data_t.VTXID_TYPE = VTXID_INT;  // integer
     data_t.SIZET_TYPE = SIZET_INT;  // integer
     data_t.VALUE_TYPE = VALUE_INT;  // integer
-    struct GRSetup config;  // primitive-specific configures
-    config.device      = device;  // setting device to run
-    config.src_node    = source;  // source vertex to begin
-    config.mark_pred   =  false;  // do not mark predecessors
-    config.idempotence =  false;  // wether enable idempotence
-    config.queue_size  =   1.0f;  // maximum queue size factor
+
+    struct GRSetup config;          // primitive-specific configures
+    config.device      =      0;    // setting device to run
+    config.src_node    = source;    // source vertex to begin
+    config.mark_pred   =  false;    // do not mark predecessors
+    config.idempotence =  false;    // whether enable idempotent
+    config.queue_size  =   1.0f;    // maximum queue size factor
+
     struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     graph_i->num_nodes   = num_nodes;
     graph_i->num_edges   = num_edges;
     graph_i->row_offsets = (void*)&row_offsets[0];
     graph_i->col_indices = (void*)&col_indices[0];
-    printf(" loaded num nodes: %d, num edges: %d\n", num_nodes, num_edges);
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
+    
     printf("-------------------- running --------------------\n");
     gunrock_bfs(graph_o, graph_i, config, data_t);
     memcpy(bfs_label, (int*)graph_o->node_values, num_nodes * sizeof(int));
-    printf("-------------------- cleanup --------------------\n");
+    
     if (graph_i) free(graph_i);
     if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu
index 1f49d0e2b..ccab7ad2a 100644
--- a/gunrock/app/cc/cc_app.cu
+++ b/gunrock/app/cc/cc_app.cu
@@ -40,9 +40,9 @@ using namespace gunrock::app::cc;
  */
 template<typename VertexId, typename Value, typename SizeT>
 void run_cc(
-    GRGraph      *graph_o,
-    unsigned int *components,
-    const Csr<VertexId, Value, SizeT> &csr,
+    GRGraph*      graph_o,
+    unsigned int* components,
+    const Csr<VertexId, Value, SizeT>& csr,
     const int    max_grid_size,
     const int    num_gpus) {
     typedef CCProblem<VertexId, SizeT, Value, true> Problem; // double buffer
@@ -60,10 +60,15 @@ void run_cc(
                       cc_enactor.GetFrontierType()),
                   "CC Problem Data Reset Failed", __FILE__, __LINE__);
 
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+
     util::GRError(cc_enactor.template Enact<Problem>(
                       problem, max_grid_size),
                   "CC Problem Enact Failed", __FILE__, __LINE__);
 
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
     util::GRError(problem->Extract(h_component_ids),
                   "CC Problem Data Extraction Failed", __FILE__, __LINE__);
 
@@ -87,9 +92,9 @@ void run_cc(
  * @param[in]  data_t  data type configurations
  */
 void dispatch_cc(
-    GRGraph       *graph_o,
-    unsigned int  *components,
-    const GRGraph *graph_i,
+    GRGraph*       graph_o,
+    unsigned int*  components,
+    const GRGraph* graph_i,
     const GRSetup  config,
     const GRTypes  data_t) {
     switch (data_t.VTXID_TYPE) {
@@ -154,6 +159,52 @@ void gunrock_cc(
     dispatch_cc(graph_o, components, graph_i, config, data_t);
 }
 
+/*
+ * @brief Simple interface take in CSR arrays as input
+ * @param[out] components  Return component ID for each node
+ * @return                 Number of components calculated
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ */
+int cc(
+    int*       components,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;          // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;  // integer
+    data_t.SIZET_TYPE = SIZET_INT;  // integer
+    data_t.VALUE_TYPE = VALUE_INT;  // integer
+
+    struct GRSetup config;  // primitive-specific configures
+    config.device = 0;      // setting device to run
+
+    unsigned int num_components = 0;
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
+    
+    printf("-------------------- running --------------------\n");
+    gunrock_cc(graph_o, &num_components, graph_i, config, data_t);
+    memcpy(components, (int*)graph_o->node_values, num_nodes * sizeof(int));
+    
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
+    return num_components;
+}
+
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index a50309615..2511701fa 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -93,43 +93,60 @@ struct GRSetup {
 extern "C" {
 #endif
 
-// breath-first search
+/**
+ * breath-first search
+ */
 void gunrock_bfs(
     struct GRGraph*       graph_o,
     const struct GRGraph* graph_i,
     const struct GRSetup  config,
     const struct GRTypes  data_t);
 
-// simple interface
 void bfs(
     int*       bfs_label,
     const int  num_nodes,
     const int  num_edges,
     const int* row_offsets,
     const int* col_indices,
-    const int  source,
-    const int  device);
+    const int  source);
 
-// betweenness centrality
+/**
+ * betweenness centrality
+ */
 void gunrock_bc(
     struct GRGraph*       graph_o,
     const struct GRGraph* graph_i,
     const struct GRSetup  config,
     const struct GRTypes  data_t);
 
-// connected component
+void bc(
+    float*     bc_scores,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source);
+
+/**
+ * connected component
+ */
 void gunrock_cc(
     struct GRGraph*       graph_o,
     unsigned int*         components,
     const struct GRGraph* graph_i,
     const struct GRSetup  config,
     const struct GRTypes  data_t);
-/*
-int cc(int *component, const int  num_nodes, const int  num_edges,
-       const int *offsets, const int *indices, const int  device);
-*/
 
-// single-source shortest path
+int cc(
+    int*       component,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
+/**
+ * single-source shortest path
+ */
 void gunrock_sssp(
     struct GRGraph*       graph_o,
     void*                 predecessor,
@@ -137,7 +154,15 @@ void gunrock_sssp(
     const struct GRSetup  config,
     const struct GRTypes  data_t);
 
-// page-rank
+void sssp(
+    int*       distances,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source);
+
+// pagerank
 void gunrock_pagerank(
     struct GRGraph*       graph_o,
     void*                 node_ids,
@@ -146,6 +171,14 @@ void gunrock_pagerank(
     const struct GRSetup  config,
     const struct GRTypes  data_t);
 
+void pagerank(
+    int*       node_ids,
+    float*     pagerank,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
 // degree centrality
 void gunrock_topk(
     struct  GRGraph*      graph_o,
@@ -163,6 +196,13 @@ void gunrock_mst(
     const struct GRSetup  config,
     const struct GRTypes  data_t);
 
+void mst(
+    bool*      edge_mask,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
 // TODO(ydwu): Add other primitives
 
 #ifdef __cplusplus
diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c
index 2d4842e94..baf84ff6d 100644
--- a/shared_lib_tests/simple_interface_test.c
+++ b/shared_lib_tests/simple_interface_test.c
@@ -7,20 +7,35 @@
 #include <gunrock/gunrock.h>
 
 int main(int argc, char* argv[]) {
-    int row_offsets[] = {0, 3, 6, 9, 11, 14, 15, 15};
-    int col_indices[] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+    int row_offsets[] = {0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
     size_t num_nodes = sizeof(row_offsets) / sizeof(row_offsets[0]) - 1;
     size_t num_edges = sizeof(col_indices) / sizeof(col_indices[0]);
 
+    printf("\n testing breath-first search ...\n");
     int *labels = (int*)malloc(sizeof(int) * num_nodes);
-
-    printf(" testing breath-first search ...\n");  // test bfs
-    bfs(labels, num_nodes, num_edges, row_offsets, col_indices, 0, 0);
-    printf("-------------------- outputs --------------------\n");
+    bfs(labels, num_nodes, num_edges, row_offsets, col_indices, 0);
     int node; for (node = 0; node < num_nodes; ++node) {
         printf(" node: [%d] | label (depth): [%d]\n", node, labels[node]);
     }
-    printf("------------------- completed -------------------\n");
+
+    printf("\n testing betweenness centrality ...\n");
+    float *scores = (float*)malloc(sizeof(float) * num_nodes);
+    bc(scores, num_nodes, num_edges, row_offsets, col_indices, -1);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | score: [%.4f]\n", node, scores[node]);
+    }
+
+    printf("\n testing connected components ...\n");
+    int *components = (int*)malloc(sizeof(int) * num_nodes);
+    int ret = cc(components, num_nodes, num_edges, row_offsets, col_indices);
+    printf(" total number of components: %d\n", ret);
+    for (node = 0; node < num_nodes; ++node) {
+      printf(" node: [%d] | component: [%d]\n", node, components[node]);
+    }
+
+    // TODO(ydwu): add other primitive tests
 
     if (labels) { free(labels); }
     return 0;

From 495214c187f5de3e583f7b2d9d0577d52607c90a Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Thu, 25 Jun 2015 12:12:40 -0700
Subject: [PATCH 31/36] added pr sssp and cc simple interface

---
 gunrock/app/bc/bc_app.cu                 | 13 ++--
 gunrock/app/bfs/bfs_app.cu               | 13 ++--
 gunrock/app/cc/cc_app.cu                 | 13 ++--
 gunrock/app/pr/pr_app.cu                 | 87 +++++++++++++++--------
 gunrock/app/pr/pr_enactor.cuh            |  9 ++-
 gunrock/app/sssp/sssp_app.cu             | 90 +++++++++++++++++++-----
 gunrock/gunrock.h                        | 13 ++--
 shared_lib_tests/simple_interface_test.c | 69 ++++++++++++++----
 8 files changed, 219 insertions(+), 88 deletions(-)

diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu
index 6d1379b73..353d107ba 100644
--- a/gunrock/app/bc/bc_app.cu
+++ b/gunrock/app/bc/bc_app.cu
@@ -63,7 +63,7 @@ void run_bc(
     util::GRError(problem->Init(false, csr, num_gpus),
                   "BC Problem Initialization Failed", __FILE__, __LINE__);
 
-    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
 
     VertexId start_source;
     VertexId end_source;
@@ -87,13 +87,13 @@ void run_bc(
     util::MemsetScaleKernel <<< 128, 128>>>(
         problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)csr.nodes);
 
-    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
     printf(" device elapsed time: %.4f ms\n", elapsed);
 
     util::GRError(problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
                   "BC Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    graph_o->node_values = (float*)&h_bc_values[0];   // h_bc_values per node 
+    graph_o->node_values = (float*)&h_bc_values[0];   // h_bc_values per node
     graph_o->edge_values = (float*)&h_ebc_values[0];  // h_ebc_values per edge
 
     if (problem) { delete problem; }
@@ -239,17 +239,18 @@ void bc(
 
     struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
     graph_i->num_nodes   = num_nodes;
     graph_i->num_edges   = num_edges;
     graph_i->row_offsets = (void*)&row_offsets[0];
     graph_i->col_indices = (void*)&col_indices[0];
 
-    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
-    
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
     printf("-------------------- running --------------------\n");
     gunrock_bc(graph_o, graph_i, config, data_t);
     memcpy(bc_scores, (float*)graph_o->node_values, num_nodes * sizeof(float));
-    
+
     if (graph_i) free(graph_i);
     if (graph_o) free(graph_o);
 
diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index b0421f48b..b6313fffa 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -74,14 +74,14 @@ void run_bfs(
                       src, enactor.GetFrontierType(), max_queue_sizing),
                   "BFS Problem Data Reset Failed", __FILE__, __LINE__);
 
-    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
 
     util::GRError(enactor.template Enact<Problem>(context, problem, src),
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
 
-    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
     printf(" device elapsed time: %.4f ms\n", elapsed);
-    
+
     util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
 
@@ -263,17 +263,18 @@ void bfs(
 
     struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
     graph_i->num_nodes   = num_nodes;
     graph_i->num_edges   = num_edges;
     graph_i->row_offsets = (void*)&row_offsets[0];
     graph_i->col_indices = (void*)&col_indices[0];
 
-    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
-    
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
     printf("-------------------- running --------------------\n");
     gunrock_bfs(graph_o, graph_i, config, data_t);
     memcpy(bfs_label, (int*)graph_o->node_values, num_nodes * sizeof(int));
-    
+
     if (graph_i) free(graph_i);
     if (graph_o) free(graph_o);
 
diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu
index ccab7ad2a..b4ac393b6 100644
--- a/gunrock/app/cc/cc_app.cu
+++ b/gunrock/app/cc/cc_app.cu
@@ -49,7 +49,7 @@ void run_cc(
 
     // Allocate host-side label array for gpu-computed results
     VertexId *h_component_ids
-        = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);    
+        = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     CCEnactor<false> cc_enactor(false);  // Allocate CC enactor map
     Problem *problem = new Problem;  // Allocate problem on GPU
 
@@ -60,13 +60,13 @@ void run_cc(
                       cc_enactor.GetFrontierType()),
                   "CC Problem Data Reset Failed", __FILE__, __LINE__);
 
-    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start timer
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
 
     util::GRError(cc_enactor.template Enact<Problem>(
                       problem, max_grid_size),
                   "CC Problem Enact Failed", __FILE__, __LINE__);
 
-    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // calculate elapsed
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
     printf(" device elapsed time: %.4f ms\n", elapsed);
 
     util::GRError(problem->Extract(h_component_ids),
@@ -187,17 +187,18 @@ int cc(
     unsigned int num_components = 0;
     struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
     struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
     graph_i->num_nodes   = num_nodes;
     graph_i->num_edges   = num_edges;
     graph_i->row_offsets = (void*)&row_offsets[0];
     graph_i->col_indices = (void*)&col_indices[0];
 
-    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);    
-    
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
     printf("-------------------- running --------------------\n");
     gunrock_cc(graph_o, &num_components, graph_i, config, data_t);
     memcpy(components, (int*)graph_o->node_values, num_nodes * sizeof(int));
-    
+
     if (graph_i) free(graph_i);
     if (graph_o) free(graph_o);
 
diff --git a/gunrock/app/pr/pr_app.cu b/gunrock/app/pr/pr_app.cu
index 47a9e5862..2d6d2c376 100644
--- a/gunrock/app/pr/pr_app.cu
+++ b/gunrock/app/pr/pr_app.cu
@@ -51,9 +51,8 @@ template<typename VertexId, typename Value, typename SizeT>
  void run_pagerank(
     GRGraph        *graph_o,
     VertexId       *node_ids,
-    Value          *page_rank,
+    Value          *pagerank,
     const Csr<VertexId, Value, SizeT> &csr,
-    const VertexId source,
     const Value    delta,
     const Value    error,
     const SizeT    max_iter,
@@ -67,15 +66,19 @@ template<typename VertexId, typename Value, typename SizeT>
     util::GRError(problem->Init(false, csr, num_gpus),
                   "PR Problem Initialization Failed", __FILE__, __LINE__);
 
-    util::GRError(problem->Reset(
-                      source, delta, error, enactor.GetFrontierType()),
+    util::GRError(problem->Reset(0, delta, error, enactor.GetFrontierType()),
                   "PR Problem Data Reset Failed", __FILE__, __LINE__);
 
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
+
     util::GRError(enactor.template Enact<Problem>(
                       context, problem, max_iter, max_grid_size),
                   "PR Problem Enact Failed", __FILE__, __LINE__);
 
-    util::GRError(problem->Extract(page_rank, node_ids),
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
+    util::GRError(problem->Extract(pagerank, node_ids),
                   "PR Problem Extraction Failed", __FILE__, __LINE__);
 
     if (problem) delete problem;
@@ -122,34 +125,13 @@ void dispatch_pagerank(
                 csr_graph.row_offsets    = (int*)graph_i->row_offsets;
                 csr_graph.column_indices = (int*)graph_i->col_indices;
 
-                // page-rank configurations
+                // pagerank configurations
                 float delta         = 0.85f;  // default delta value
                 float error         = 0.01f;  // error threshold
                 int   max_iter      = 20;     // maximum number of iterations
                 int   max_grid_size = 0;      // 0: leave it up to the enactor
                 int   num_gpus      = 1;      // for multi-gpu enactor to use
-                int   src_node      = -1;     // source node to start
-
-                // determine source vertex to start sssp
-                switch (config.src_mode) {
-                case randomize: {
-                    src_node = graphio::RandomNode(csr_graph.nodes);
-                    break;
-                }
-                case largest_degree: {
-                    int max_node = 0;
-                    src_node = csr_graph.GetNodeWithHighestDegree(max_node);
-                    break;
-                }
-                case manually: {
-                    src_node = config.src_node;
-                    break;
-                }
-                default: {
-                    src_node = -1;
-                    break;
-                }
-                }
+
                 delta    = config.delta;
                 error    = config.error;
                 max_iter = config.max_iter;
@@ -159,7 +141,6 @@ void dispatch_pagerank(
                     (int*)node_ids,
                     (float*)pagerank,
                     csr_graph,
-                    src_node,
                     delta,
                     error,
                     max_iter,
@@ -205,6 +186,54 @@ void gunrock_pagerank(
         graph_o, node_ids, pagerank, graph_i, config, data_t, *context);
 }
 
+/*
+ * @brief Simple PageRank interface that takes CSR arrays as input
+ * @param[out] node_ids    Return node IDs (paired index-wise with pagerank)
+ * @param[out] pagerank    Return PageRank score of each node in node_ids
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ */
+void pagerank(
+    int*                node_ids,
+    float*              pagerank,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;            // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;    // integer
+    data_t.SIZET_TYPE = SIZET_INT;    // integer
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float ranks
+
+    struct GRSetup config;     // primitive-specific configures
+    config.device    =     0;  // setting device to run
+    config.delta     = 0.85f;  // default delta value
+    config.error     = 0.01f;  // default error threshold
+    config.max_iter  =    20;  // maximum number of iterations
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, data_t);
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
+}
+
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/app/pr/pr_enactor.cuh b/gunrock/app/pr/pr_enactor.cuh
index 423196b92..61dbedbd7 100644
--- a/gunrock/app/pr/pr_enactor.cuh
+++ b/gunrock/app/pr/pr_enactor.cuh
@@ -283,7 +283,7 @@ public:
                     if (retval = work_progress.GetQueueLength(
                             frontier_attribute.queue_index+1,
                             frontier_attribute_queue_length)) break;
-                    printf(", %d",
+                    printf(", %lld",
                            (long long) frontier_attribute_queue_length);
                 }
 
@@ -390,10 +390,9 @@ public:
             if (retval) break;
 
             // sort according to the rank values
-            util::CUBRadixSort<Value, VertexId>(
-                false, graph_slice->nodes,
-                problem->data_slices[0]->d_rank_curr,
-                problem->data_slices[0]->d_node_ids);
+            MergesortPairs(problem->data_slices[0]->d_rank_curr,
+                           problem->data_slices[0]->d_node_ids,
+                           graph_slice->nodes, mgpu::greater<Value>(), context);
 
             if (d_scanned_edges) cudaFree(d_scanned_edges);
 
diff --git a/gunrock/app/sssp/sssp_app.cu b/gunrock/app/sssp/sssp_app.cu
index fd02d0b3c..c7621b8a0 100644
--- a/gunrock/app/sssp/sssp_app.cu
+++ b/gunrock/app/sssp/sssp_app.cu
@@ -46,18 +46,18 @@ using namespace gunrock::app::sssp;
  * @param[in]  delta_factor user set
  * @param[in]  context moderngpu context
  */
-template<typename VertexId, typename Value, typename SizeT, 
+template<typename VertexId, typename Value, typename SizeT,
          bool MARK_PREDECESSORS>
 void run_sssp(
-    GRGraph        *graph_o,
-    VertexId       *predecessor,
-    const Csr<VertexId, Value, SizeT> &csr,
+    GRGraph*       graph_o,
+    VertexId*      predecessor,
+    const Csr<VertexId, Value, SizeT>& csr,
     const VertexId src,
     const int      max_grid_size,
     const float    queue_sizing,
     const int      num_gpus,
     const int      delta_factor,
-    CudaContext    &context) {
+    CudaContext&   context) {
     typedef SSSPProblem<VertexId, SizeT, Value, MARK_PREDECESSORS> Problem;
     // Allocate host-side label array for gpu-computed results
     Value *h_labels = (Value*)malloc(sizeof(Value) * csr.nodes);
@@ -75,10 +75,15 @@ void run_sssp(
     util::GRError(problem->Reset(src, enactor.GetFrontierType(), queue_sizing),
                   "SSSP Problem Data Reset Failed", __FILE__, __LINE__);
 
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
+
     util::GRError(enactor.template Enact<Problem>(
                       context, problem, src, queue_sizing, max_grid_size),
                   "SSSP Problem Enact Failed", __FILE__, __LINE__);
 
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
     util::GRError(problem->Extract(h_labels, predecessor),
                   "SSSP Problem Data Extraction Failed", __FILE__, __LINE__);
 
@@ -100,12 +105,12 @@ void run_sssp(
  * @param[in]  context     ModernGPU context
  */
 void dispatch_sssp(
-    GRGraph       *graph_o,
-    void          *predecessor,
-    const GRGraph *graph_i,
-    const GRSetup config,
-    const GRTypes data_t,
-    CudaContext   &context) {
+    GRGraph*       graph_o,
+    void*          predecessor,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext&   context) {
     switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
         switch (data_t.SIZET_TYPE) {
@@ -289,17 +294,70 @@ void dispatch_sssp(
  * @param[in]  data_t      Data type configurations
  */
 void gunrock_sssp(
-    GRGraph       *graph_o,
-    void          *predecessor,
-    const GRGraph *graph_i,
-    const GRSetup config,
-    const GRTypes data_t) {
+    GRGraph*       graph_o,
+    void*          predecessor,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
     unsigned int device = 0;
     device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
     dispatch_sssp(graph_o, predecessor, graph_i, config, data_t, *context);
 }
 
+/*
+ * @brief Simple SSSP interface that takes CSR arrays as input
+ * @param[out] distances   Return shortest distance to source per node
+ * @param[in]  num_nodes,num_edges Node and edge counts of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  edge_values CSR-formatted graph input edge values (weights)
+ * @param[in]  source      Source vertex to begin traversal
+ */
+void sssp(
+    unsigned int*       distances,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices,
+    const unsigned int* edge_values,
+    const int           source) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;           // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;   // integer
+    data_t.SIZET_TYPE = SIZET_INT;   // integer
+    data_t.VALUE_TYPE = VALUE_UINT;  // unsigned integer
+
+    struct GRSetup config;          // primitive-specific configures
+    config.device      =      0;    // setting device to run
+    config.src_node    = source;    // source vertex to begin
+    config.mark_pred   =  false;    // do not mark predecessors
+    config.delta_factor =    32;    // delta factor for delta-stepping
+    config.queue_size  =   1.0f;    // maximum queue size factor
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_sssp(graph_o, (void*)NULL, graph_i, config, data_t);
+    memcpy(distances, (unsigned int*)graph_o->node_values,
+           num_nodes * sizeof(unsigned int));
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
+}
+
 // Leave this at the end of the file
 // Local Variables:
 // mode:c++
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index 2511701fa..24d8d421f 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -155,12 +155,13 @@ void gunrock_sssp(
     const struct GRTypes  data_t);
 
 void sssp(
-    int*       distances,
-    const int  num_nodes,
-    const int  num_edges,
-    const int* row_offsets,
-    const int* col_indices,
-    const int  source);
+    unsigned int*       distances,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices,
+    const unsigned int* edge_values,
+    const int           source);
 
 // pagerank
 void gunrock_pagerank(
diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c
index baf84ff6d..66cd14c70 100644
--- a/shared_lib_tests/simple_interface_test.c
+++ b/shared_lib_tests/simple_interface_test.c
@@ -7,36 +7,77 @@
 #include <gunrock/gunrock.h>
 
 int main(int argc, char* argv[]) {
-    int row_offsets[] = {0, 3, 6, 11, 15, 19, 23, 26};
-    int col_indices[] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
-                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+
+    ///////////////////////////////////////////////////////////////////////////
+    // define input graph
+    int row_offsets[] = {
+        0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[] = {
+        1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+        5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    unsigned int edge_values[] = {
+        3, 4, 5, 3, 5, 7, 4, 5, 7, 8, 9, 5, 7, 10,
+        11, 7, 8, 11, 12, 9, 10, 11, 13, 11, 12, 13};
+
+    // nodes = length of row offsets-1, edges = length of column indices
     size_t num_nodes = sizeof(row_offsets) / sizeof(row_offsets[0]) - 1;
     size_t num_edges = sizeof(col_indices) / sizeof(col_indices[0]);
 
+    ///////////////////////////////////////////////////////////////////////////
+    // allocate host arrays to store test results
+    int*   bfs_label = (  int*)malloc(sizeof(  int) * num_nodes);
+    float* bc_scores = (float*)malloc(sizeof(float) * num_nodes);
+    int*   conn_comp = (  int*)malloc(sizeof(  int) * num_nodes);
+    unsigned int *sssp_dist =
+        (unsigned int*)malloc(sizeof( unsigned int) * num_nodes);
+    int*    pr_nodes = (  int*)malloc(sizeof(  int) * num_nodes);
+    float*  pr_ranks = (float*)malloc(sizeof(float) * num_nodes);
+
+    ///////////////////////////////////////////////////////////////////////////
     printf("\n testing breath-first search ...\n");
-    int *labels = (int*)malloc(sizeof(int) * num_nodes);
-    bfs(labels, num_nodes, num_edges, row_offsets, col_indices, 0);
+    bfs(bfs_label, num_nodes, num_edges, row_offsets, col_indices, 0);
     int node; for (node = 0; node < num_nodes; ++node) {
-        printf(" node: [%d] | label (depth): [%d]\n", node, labels[node]);
+        printf(" node: [%d] | label (depth): [%d]\n", node, bfs_label[node]);
     }
 
+    ///////////////////////////////////////////////////////////////////////////
     printf("\n testing betweenness centrality ...\n");
-    float *scores = (float*)malloc(sizeof(float) * num_nodes);
-    bc(scores, num_nodes, num_edges, row_offsets, col_indices, -1);
+    bc(bc_scores, num_nodes, num_edges, row_offsets, col_indices, -1);
     for (node = 0; node < num_nodes; ++node) {
-        printf(" node: [%d] | score: [%.4f]\n", node, scores[node]);
+        printf(" node: [%d] | score: [%.4f]\n", node, bc_scores[node]);
     }
 
+    ///////////////////////////////////////////////////////////////////////////
     printf("\n testing connected components ...\n");
-    int *components = (int*)malloc(sizeof(int) * num_nodes);
-    int ret = cc(components, num_nodes, num_edges, row_offsets, col_indices);
-    printf(" total number of components: %d\n", ret);
+    int num_comp = cc(conn_comp, num_nodes, num_edges, row_offsets, col_indices);
+    printf(" total number of components: %d\n", num_comp);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | component: [%d]\n", node, conn_comp[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing single-source shortest path ...\n");
+    sssp(sssp_dist, num_nodes, num_edges, row_offsets, col_indices, edge_values, 0);
     for (node = 0; node < num_nodes; ++node) {
-      printf(" node: [%d] | component: [%d]\n", node, components[node]);
+        printf(" node: [%d] | distance: [%u]\n", node, sssp_dist[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing pagerank ...\n");
+    pagerank(pr_nodes, pr_ranks, num_nodes, num_edges, row_offsets, col_indices);
+    for (node = 0; node < num_nodes; ++node) {
+      printf(" node: [%d] | rank: [%.4f]\n", pr_nodes[node], pr_ranks[node]);
     }
 
     // TODO(ydwu): add other primitive tests
 
-    if (labels) { free(labels); }
+    // clean ups
+    if (bfs_label) free(bfs_label);
+    if (bc_scores) free(bc_scores);
+    if (conn_comp) free(conn_comp);
+    if (sssp_dist) free(sssp_dist);
+    if (pr_nodes)   free(pr_nodes);
+    if (pr_ranks)   free(pr_ranks);
+
     return 0;
 }

From e95765cb86ae5364e7b27fcc83d252958ff897d8 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Thu, 25 Jun 2015 13:51:39 -0700
Subject: [PATCH 32/36] added py samples

---
 python/betweenness_centrality.py      | 26 ++++++++++++++++++++++++
 python/breath_first_search.py         | 26 ++++++++++++++++++++++++
 python/connected_components.py        | 27 +++++++++++++++++++++++++
 python/pagerank.py                    | 29 +++++++++++++++++++++++++++
 python/single_source_shortest_path.py | 28 ++++++++++++++++++++++++++
 python/toy_graph/col.txt              | 26 ++++++++++++++++++++++++
 python/toy_graph/row.txt              |  8 ++++++++
 python/toy_graph/val.txt              | 26 ++++++++++++++++++++++++
 8 files changed, 196 insertions(+)
 create mode 100644 python/betweenness_centrality.py
 create mode 100644 python/breath_first_search.py
 create mode 100644 python/connected_components.py
 create mode 100644 python/pagerank.py
 create mode 100644 python/single_source_shortest_path.py
 create mode 100644 python/toy_graph/col.txt
 create mode 100644 python/toy_graph/row.txt
 create mode 100644 python/toy_graph/val.txt

diff --git a/python/betweenness_centrality.py b/python/betweenness_centrality.py
new file mode 100644
index 000000000..fd27df3be
--- /dev/null
+++ b/python/betweenness_centrality.py
@@ -0,0 +1,26 @@
+### sample python interface - betweenness centrality
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('./libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+scores = pointer((c_float * nodes)())
+
+### call gunrock function on device
+gunrock.bc(scores, nodes, edges, row, col, 2)
+
+### sample results
+print ' node bc scores:',
+for idx in range(nodes): print scores[0][idx],
diff --git a/python/breath_first_search.py b/python/breath_first_search.py
new file mode 100644
index 000000000..93c322484
--- /dev/null
+++ b/python/breath_first_search.py
@@ -0,0 +1,26 @@
+### sample python interface - breadth-first search
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('./libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+labels = pointer((c_int * nodes)())
+
+### call gunrock function on device
+gunrock.bfs(labels, nodes, edges, row, col, 0)
+
+### sample results
+print ' bfs labels (depth):',
+for idx in range(nodes): print labels[0][idx],
diff --git a/python/connected_components.py b/python/connected_components.py
new file mode 100644
index 000000000..a113a36d8
--- /dev/null
+++ b/python/connected_components.py
@@ -0,0 +1,27 @@
+### sample python interface - connected components
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('./libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+labels = pointer((c_int * nodes)())
+
+### call gunrock function on device
+num_components = gunrock.cc(labels, nodes, edges, row, col)
+
+### sample results
+print ' number of components: ' + str(num_components)
+print ' component ids:',
+for idx in range(nodes): print labels[0][idx],
diff --git a/python/pagerank.py b/python/pagerank.py
new file mode 100644
index 000000000..6b95a246d
--- /dev/null
+++ b/python/pagerank.py
@@ -0,0 +1,29 @@
+### sample python interface - pagerank through libgunrock
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('./libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]  # one integer per line
+
+### wrap the CSR lists in ctypes c_int arrays and take pointers to hand to gunrock
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1  # row.txt holds one more entry than there are nodes
+edges = len(col_list)      # one col.txt entry per edge
+
+### output arrays: zero-initialized, one slot per node
+node = pointer((c_int * nodes)())    # printed below as the node column
+rank = pointer((c_float * nodes)())  # printed below as the rank column
+
+### call gunrock function on device (both output pointers are passed first)
+gunrock.pagerank(node, rank, nodes, edges, row, col)
+
+### sample results: one "node rank" pair per line (Python 2 print statements)
+print 'top page rank:'
+for idx in range(nodes):
+    print node[0][idx],  # trailing comma: stay on the same line for the rank
+    print rank[0][idx]   # node[0] / rank[0] dereference the pointers to the arrays
diff --git a/python/single_source_shortest_path.py b/python/single_source_shortest_path.py
new file mode 100644
index 000000000..ca67fcfc9
--- /dev/null
+++ b/python/single_source_shortest_path.py
@@ -0,0 +1,28 @@
+### sample python interface - single-source shortest path (SSSP) through libgunrock
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('./libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]  # one integer per line
+val_list = [int(x.strip()) for x in open('toy_graph/val.txt')]  # one integer per line
+
+### wrap the CSR lists in ctypes arrays and take pointers to hand to gunrock
+row = pointer((c_int  * len(row_list))(*row_list))
+col = pointer((c_int  * len(col_list))(*col_list))
+val = pointer((c_uint * len(val_list))(*val_list))  # unsigned, unlike row/col
+nodes = len(row_list) - 1  # row.txt holds one more entry than there are nodes
+edges = len(col_list)      # one col.txt entry per edge
+
+### output array: one zero-initialized c_uint per node, passed as the first argument
+labels = pointer((c_uint * nodes)())
+
+### call gunrock function on device (last argument 0: presumably the source vertex - TODO confirm)
+gunrock.sssp(labels, nodes, edges, row, col, val, 0)
+
+### sample results (Python 2 print statements; the trailing comma keeps output on one line)
+print ' sssp labels (distance):',
+for idx in range(nodes): print labels[0][idx],  # labels[0] dereferences the pointer to the array
diff --git a/python/toy_graph/col.txt b/python/toy_graph/col.txt
new file mode 100644
index 000000000..12c10b45e
--- /dev/null
+++ b/python/toy_graph/col.txt
@@ -0,0 +1,26 @@
+1
+2
+3
+0
+2
+4
+0
+1
+3
+4
+5
+0
+2
+5
+6
+1
+2
+5
+6
+2
+3
+4
+6
+3
+4
+5
diff --git a/python/toy_graph/row.txt b/python/toy_graph/row.txt
new file mode 100644
index 000000000..1a84c1d97
--- /dev/null
+++ b/python/toy_graph/row.txt
@@ -0,0 +1,8 @@
+0
+3
+6
+11
+15
+19
+23
+26
diff --git a/python/toy_graph/val.txt b/python/toy_graph/val.txt
new file mode 100644
index 000000000..15282b913
--- /dev/null
+++ b/python/toy_graph/val.txt
@@ -0,0 +1,26 @@
+3
+4
+5
+3
+5
+7
+4
+5
+7
+8
+9
+5
+7
+10
+11
+7
+8
+11
+12
+9
+10
+11
+13
+11
+12
+13

From 93e7bb976a658e6e1683ace1d232cbe2a3dfbd2b Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Fri, 26 Jun 2015 09:29:52 -0700
Subject: [PATCH 33/36] change lib path

---
 python/betweenness_centrality.py      | 2 +-
 python/breath_first_search.py         | 2 +-
 python/connected_components.py        | 2 +-
 python/pagerank.py                    | 2 +-
 python/single_source_shortest_path.py | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/betweenness_centrality.py b/python/betweenness_centrality.py
index fd27df3be..6f7031e6b 100644
--- a/python/betweenness_centrality.py
+++ b/python/betweenness_centrality.py
@@ -3,7 +3,7 @@
 from ctypes import *
 
 ### load gunrock shared library - libgunrock
-gunrock = cdll.LoadLibrary('./libgunrock.so')
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
 
 ### read in input CSR arrays from files
 row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
diff --git a/python/breath_first_search.py b/python/breath_first_search.py
index 93c322484..b67fe80c0 100644
--- a/python/breath_first_search.py
+++ b/python/breath_first_search.py
@@ -3,7 +3,7 @@
 from ctypes import *
 
 ### load gunrock shared library - libgunrock
-gunrock = cdll.LoadLibrary('./libgunrock.so')
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
 
 ### read in input CSR arrays from files
 row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
diff --git a/python/connected_components.py b/python/connected_components.py
index a113a36d8..89fd824cb 100644
--- a/python/connected_components.py
+++ b/python/connected_components.py
@@ -3,7 +3,7 @@
 from ctypes import *
 
 ### load gunrock shared library - libgunrock
-gunrock = cdll.LoadLibrary('./libgunrock.so')
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
 
 ### read in input CSR arrays from files
 row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
diff --git a/python/pagerank.py b/python/pagerank.py
index 6b95a246d..642fa2e12 100644
--- a/python/pagerank.py
+++ b/python/pagerank.py
@@ -3,7 +3,7 @@
 from ctypes import *
 
 ### load gunrock shared library - libgunrock
-gunrock = cdll.LoadLibrary('./libgunrock.so')
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
 
 ### read in input CSR arrays from files
 row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
diff --git a/python/single_source_shortest_path.py b/python/single_source_shortest_path.py
index ca67fcfc9..69edc2b39 100644
--- a/python/single_source_shortest_path.py
+++ b/python/single_source_shortest_path.py
@@ -3,7 +3,7 @@
 from ctypes import *
 
 ### load gunrock shared library - libgunrock
-gunrock = cdll.LoadLibrary('./libgunrock.so')
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
 
 ### read in input CSR arrays from files
 row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]

From fdeb7fbd855f16045c412c8f2becebcdc1cc460d Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Fri, 26 Jun 2015 13:17:53 -0400
Subject: [PATCH 34/36] Update betweenness_centrality.py

---
 python/betweenness_centrality.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/betweenness_centrality.py b/python/betweenness_centrality.py
index 6f7031e6b..e7978d04f 100644
--- a/python/betweenness_centrality.py
+++ b/python/betweenness_centrality.py
@@ -19,7 +19,7 @@
 scores = pointer((c_float * nodes)())
 
 ### call gunrock function on device
-gunrock.bc(scores, nodes, edges, row, col, 2)
+gunrock.bc(scores, nodes, edges, row, col, -1)
 
 ### sample results
 print ' node bc scores:',

From fe385efbf17f97922fe76f72a0c0ba42155d8d26 Mon Sep 17 00:00:00 2001
From: Yangzihao Wang <yzhwang@ucdavis.edu>
Date: Fri, 26 Jun 2015 10:43:29 -0700
Subject: [PATCH 35/36] only display the first incorrect value, count for
 others, but do not display. Added WriteToLigraFile function to form Ligra
 adjlist input file. make LB the default traversal mode in SSSP.

---
 gunrock/csr.cuh             | 23 +++++++++++++++++++++++
 gunrock/util/test_utils.cuh |  3 +--
 tests/sssp/ppopp-test.sh    |  4 ++--
 tests/sssp/test_sssp.cu     |  4 ++--
 4 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/gunrock/csr.cuh b/gunrock/csr.cuh
index 0c2f46373..81f2c50e7 100644
--- a/gunrock/csr.cuh
+++ b/gunrock/csr.cuh
@@ -148,6 +148,27 @@ struct Csr {
         }
     }
 
+    /** @brief Write the CSR arrays as text to "<file_name>.adj": a "v v e"
+     *  header, then v row offsets, e column indices and, when edge_values
+     *  is non-NULL, e edge values -- one entry per line. */
+    void WriteToLigraFile(char  *file_name, SizeT v, SizeT e, SizeT *row,
+                     VertexId *col, Value *edge_values = NULL) {
+        char adj_name[256];
+        // bounded write: a long file_name is truncated, not overflowed
+        snprintf(adj_name, sizeof(adj_name), "%s.adj", file_name);
+        printf("writing to ligra .adj file.\n");
+
+        std::ofstream fout3(adj_name);
+        if (!fout3.is_open()) return;  // as before: silently skip on open failure
+        fout3 << v << " " << v << " " << e << std::endl;
+        // SizeT counters match the bounds' type (no int/SizeT mismatch)
+        for (SizeT i = 0; i < v; ++i) fout3 << row[i] << std::endl;
+        for (SizeT i = 0; i < e; ++i) fout3 << col[i] << std::endl;
+        if (edge_values != NULL)
+            for (SizeT i = 0; i < e; ++i) fout3 << edge_values[i] << std::endl;
+        fout3.close();
+    }
+
     /**
      *
      * @brief Read from stored row_offsets, column_indices arrays
@@ -266,6 +287,8 @@ struct Csr {
         if (LOAD_EDGE_VALUES) {
             WriteToFile(output_file, nodes, edges,
                         row_offsets, column_indices, edge_values);
+            WriteToLigraFile(output_file, nodes, edges,
+                        row_offsets, column_indices, edge_values);
         } else {
             WriteToFile(output_file, nodes, edges,
                         row_offsets, column_indices);
diff --git a/gunrock/util/test_utils.cuh b/gunrock/util/test_utils.cuh
index 11eff1ddb..7169c4e50 100644
--- a/gunrock/util/test_utils.cuh
+++ b/gunrock/util/test_utils.cuh
@@ -472,9 +472,8 @@ int CompareResults(
                 printf("...]");
             }
             flag += 1;
-            //return flag;
         }
-        //if (!is_right && flag > 0) flag += 1;
+        if (!is_right && flag > 0) flag += 1;
     }
     printf("\n");
     if (!flag)
diff --git a/tests/sssp/ppopp-test.sh b/tests/sssp/ppopp-test.sh
index cbc55562d..8934de90b 100644
--- a/tests/sssp/ppopp-test.sh
+++ b/tests/sssp/ppopp-test.sh
@@ -1,7 +1,7 @@
 mkdir -p eval/PPOPP15
 for i in  1-soc 2-bitcoin 3-kron 6-roadnet
 do
-    echo ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10
-         ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt
+    echo ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10
+         ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt
     sleep 1
 done
diff --git a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu
index 2bc204495..200c668a3 100644
--- a/tests/sssp/test_sssp.cu
+++ b/tests/sssp/test_sssp.cu
@@ -566,7 +566,7 @@ void RunTests(
     args.GetCmdLineArgument("traversal-mode", traversal_mode);
     if (traversal_mode == -1)
     {
-        traversal_mode = graph.GetAverageDegree() > 8 ? 0 : 1;
+        traversal_mode = 0;
     }
 
     instrumented = args.CheckCmdLineFlag("instrumented");
@@ -687,7 +687,7 @@ int main( int argc, char** argv)
         }
 
         csr.PrintHistogram();
-        csr.DisplayGraph(true); //print graph with edge_value
+        //csr.DisplayGraph(true); //print graph with edge_value
         //csr.GetAverageEdgeValue();
         //csr.GetAverageDegree();
         //int max_degree;

From 57842ba66aa9eb6b53a91e87c594b182f29b2d52 Mon Sep 17 00:00:00 2001
From: Yuduo Wu <yudwu@ucdavis.edu>
Date: Fri, 26 Jun 2015 14:47:06 -0400
Subject: [PATCH 36/36] Delete FAQ.markdown

---
 FAQ.markdown | 135 ---------------------------------------------------
 1 file changed, 135 deletions(-)
 delete mode 100644 FAQ.markdown

diff --git a/FAQ.markdown b/FAQ.markdown
deleted file mode 100644
index 1af4bb1a7..000000000
--- a/FAQ.markdown
+++ /dev/null
@@ -1,135 +0,0 @@
-Gunrock FAQ
-===========
-
-What does it do?
-----------------
-
-Gunrock is a fast-and-efficient graph processing library on the GPU which
-provides a set of graph algorithms used in big data analytics and visualization
-with high performance.  It also provides a set of operators which abstract the
-general operations in graph processing for other developers to build
-high-performance graph algorithm prototyes with minimum programming effort.
-
-How does it do it?
------------------
-
-Gunrock takes advantage of the immense computational power available in
-commodity-level, off-the-shelf Graphics Processing Units (GPUs), originally
-designed to handle the parallel computational tasks in computer graphics, to
-perform graph traversal and computation in parallel on thousands of GPU's
-computing cores.
-
-Who should want this?
----------------------
-
-Gunrock is built with two kinds of users in mind: The first kind of users are
-programmers who build big graph analytics and visualization project and need to
-use existing graph primitives provided by Gunrock.  The second kind of users
-are programmers who want to use Gunrock's high-level, programmable abstraction
-to express, develop, and refine their own (and often more complicated) graph
-primitives.
-    
-What is the skill set users need to use it?
--------------------------------------------
-
-for the first kind of users, C/C++ background is sufficient. We are also
-building Gunrock as a shared library with C interfaces which can be loaded by
-other languages such as Python and Julia.  for the second kind of users, they
-need to have the C/C++ background and also an understanding of parallel
-programming, especially BSP (Bulk-Synchronous Programming) model which Gunrock
-uses.
-
-What platforms/languages do people need to know in order to modify or integrate it with other tools?
-----------------------------------------------------------------------------------------------------
-
-Using the exposed interface, the users do not need to know CUDA or OpenCL to
-modify or integrate Gunrock to their own tools. However, an essential
-understanding of parallel programming and BSP model is necessary if one wants
-to add/modify graph primitives in Gunrock.
-
-Why would someone want this?
-----------------------------
-
-The study of social networks, webgraphs, biological networks, and unstructured
-meshes in scientific simulation has raised a significant demand for efficient
-parallel frameworks for processing and analytics on large-scale graphs. Initial
-research efforts in using GPUs for graph processing and anlytics are promising.
-
-How is it better than the current state of the art?
----------------------------------------------------
-
-Most existing CPU large graph processing libraries perform worse on large
-graphs with billions of edges. Supercomputer or expensive clusters can achieve
-close to real-time feedback with high cost on hardware infrastructure. With
-GPUs, we can achieve the same real-time feedback with much lower cost on
-hardware. Gunrock has the best performance among the limited research efforts
-put on GPU graph processing. With a set of general graph processing operators
-exposed to users, it is also more flexible than other GPU/CPU graph library in
-terms of programmability.
-
-How would someone get it?
--------------------------
-
-Gunrock is an open-sourced library. The code, documentation, and quick start
-guide are all on its [github page](gunrock.github.io).
-    
-Is a user account required?
----------------------------
-
-No. One can use either git clone or download directly to get the source code
-and documentation of Gunrock.
-
-Are all of its components/dependencies easy to find?
-----------------------------------------------------
-
-Gunrock has three dependencies. Two of them are also GPU primitive library which
-also reside on github. The third one is Boost (Gunrock uses Boost Graph Library
-to implement CPU reference testing algorithms). All dependencies do not require
-installation. To use, one only needs to download or git clone them and put them
-in the according directories. More details in the installation section of this
-documentation.
-
-How would someone install it?
------------------------------
-
-For C/C++ programmer, integrating Gunrock into your projects is easy. Since it
-is a template based library, just add the include files in your code. The
-simple example and all the testrigs will provide detailed information on how to
-do this.
-
-For programmers who use Python, Julia, or other language and want to call
-Gunrock APIs, we are building a shared library with binary compatible
-C interfaces. It will be included in the soon-to-arrive next release of
-Gunrock.
-
-Can anyone install it? Do they need IT help?
---------------------------------------------
-
-Gunrock is targeted at developers who are familiar with basic software
-engineering. For non-technical people, IT help might needed.
-
-Does this process actually work? All the time? On all systems specified?
-------------------------------------------------------------------------
-Currently, Gunrock has been tested on two Linux distributions: Linux Mint and
-Ubuntu. But we expect it to run correctly on other Linux distributions too.
-We are currently building a Cmake solution to port Gunrock to Mac and Windows.
-The feature will be included in the soon-to-arrive next release of Gunrock.
-
-How would someone test that it's working with provided sample data?
--------------------------------------------------------------------
-
-Testrigs are provided as well as a small simple example for users to test the
-correctness and performance of every graph primitive. 
-
-Is the "using" of sample data clear?
-------------------------------------
-
-On linux, one only needs to go to the dataset directory and run "make", the
-script will automatically download all the needed datasets. One can also choose
-to download a single dataset in its separated directory.
-
-How would someone use it with their own data?
----------------------------------------------
-
-Gunrock supports Matrix Market (.mtx) file format, users need to pre-process
-the graph data into this format before running Gunrock.