diff --git a/CMakeLists.txt b/CMakeLists.txt
index 82a315c20..a58744b37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,7 +193,7 @@ add_test(NAME TestSSSP COMMAND test_sssp)
 set_tests_properties(TestSSSP
   PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*1.*: Label.*39.*: Predecessor.*0")
 
-add_test(NAME TestPR COMMAND test_pr --undirected)
+add_test(NAME TestPR COMMAND test_pr)
 set_tests_properties(TestPR
   PROPERTIES PASS_REGULAR_EXPRESSION "Node ID.*2.*: Page Rank.*0.357069.")
 
diff --git a/FAQ.markdown b/FAQ.markdown
deleted file mode 100644
index 1af4bb1a7..000000000
--- a/FAQ.markdown
+++ /dev/null
@@ -1,135 +0,0 @@
-Gunrock FAQ
-===========
-
-What does it do?
-----------------
-
-Gunrock is a fast-and-efficient graph processing library on the GPU which
-provides a set of graph algorithms used in big data analytics and visualization
-with high performance.  It also provides a set of operators which abstract the
-general operations in graph processing for other developers to build
-high-performance graph algorithm prototyes with minimum programming effort.
-
-How does it do it?
------------------
-
-Gunrock takes advantage of the immense computational power available in
-commodity-level, off-the-shelf Graphics Processing Units (GPUs), originally
-designed to handle the parallel computational tasks in computer graphics, to
-perform graph traversal and computation in parallel on thousands of GPU's
-computing cores.
-
-Who should want this?
----------------------
-
-Gunrock is built with two kinds of users in mind: The first kind of users are
-programmers who build big graph analytics and visualization project and need to
-use existing graph primitives provided by Gunrock.  The second kind of users
-are programmers who want to use Gunrock's high-level, programmable abstraction
-to express, develop, and refine their own (and often more complicated) graph
-primitives.
-    
-What is the skill set users need to use it?
--------------------------------------------
-
-for the first kind of users, C/C++ background is sufficient. We are also
-building Gunrock as a shared library with C interfaces which can be loaded by
-other languages such as Python and Julia.  for the second kind of users, they
-need to have the C/C++ background and also an understanding of parallel
-programming, especially BSP (Bulk-Synchronous Programming) model which Gunrock
-uses.
-
-What platforms/languages do people need to know in order to modify or integrate it with other tools?
-----------------------------------------------------------------------------------------------------
-
-Using the exposed interface, the users do not need to know CUDA or OpenCL to
-modify or integrate Gunrock to their own tools. However, an essential
-understanding of parallel programming and BSP model is necessary if one wants
-to add/modify graph primitives in Gunrock.
-
-Why would someone want this?
-----------------------------
-
-The study of social networks, webgraphs, biological networks, and unstructured
-meshes in scientific simulation has raised a significant demand for efficient
-parallel frameworks for processing and analytics on large-scale graphs. Initial
-research efforts in using GPUs for graph processing and anlytics are promising.
-
-How is it better than the current state of the art?
----------------------------------------------------
-
-Most existing CPU large graph processing libraries perform worse on large
-graphs with billions of edges. Supercomputer or expensive clusters can achieve
-close to real-time feedback with high cost on hardware infrastructure. With
-GPUs, we can achieve the same real-time feedback with much lower cost on
-hardware. Gunrock has the best performance among the limited research efforts
-put on GPU graph processing. With a set of general graph processing operators
-exposed to users, it is also more flexible than other GPU/CPU graph library in
-terms of programmability.
-
-How would someone get it?
--------------------------
-
-Gunrock is an open-sourced library. The code, documentation, and quick start
-guide are all on its [github page](gunrock.github.io).
-    
-Is a user account required?
----------------------------
-
-No. One can use either git clone or download directly to get the source code
-and documentation of Gunrock.
-
-Are all of its components/dependencies easy to find?
-----------------------------------------------------
-
-Gunrock has three dependencies. Two of them are also GPU primitive library which
-also reside on github. The third one is Boost (Gunrock uses Boost Graph Library
-to implement CPU reference testing algorithms). All dependencies do not require
-installation. To use, one only needs to download or git clone them and put them
-in the according directories. More details in the installation section of this
-documentation.
-
-How would someone install it?
------------------------------
-
-For C/C++ programmer, integrating Gunrock into your projects is easy. Since it
-is a template based library, just add the include files in your code. The
-simple example and all the testrigs will provide detailed information on how to
-do this.
-
-For programmers who use Python, Julia, or other language and want to call
-Gunrock APIs, we are building a shared library with binary compatible
-C interfaces. It will be included in the soon-to-arrive next release of
-Gunrock.
-
-Can anyone install it? Do they need IT help?
---------------------------------------------
-
-Gunrock is targeted at developers who are familiar with basic software
-engineering. For non-technical people, IT help might needed.
-
-Does this process actually work? All the time? On all systems specified?
-------------------------------------------------------------------------
-Currently, Gunrock has been tested on two Linux distributions: Linux Mint and
-Ubuntu. But we expect it to run correctly on other Linux distributions too.
-We are currently building a Cmake solution to port Gunrock to Mac and Windows.
-The feature will be included in the soon-to-arrive next release of Gunrock.
-
-How would someone test that it's working with provided sample data?
--------------------------------------------------------------------
-
-Testrigs are provided as well as a small simple example for users to test the
-correctness and performance of every graph primitive. 
-
-Is the "using" of sample data clear?
-------------------------------------
-
-On linux, one only needs to go to the dataset directory and run "make", the
-script will automatically download all the needed datasets. One can also choose
-to download a single dataset in its separated directory.
-
-How would someone use it with their own data?
----------------------------------------------
-
-Gunrock supports Matrix Market (.mtx) file format, users need to pre-process
-the graph data into this format before running Gunrock.
diff --git a/dataset/small/test_mst.mtx b/dataset/small/test_mst.mtx
new file mode 100644
index 000000000..a6949843f
--- /dev/null
+++ b/dataset/small/test_mst.mtx
@@ -0,0 +1,18 @@
+9 9 17
+1 2 2
+2 3 2
+2 4 17
+3 1 2
+3 4 38
+3 5 10
+4 5 2
+5 1 82
+5 2 11
+6 3 100
+6 4 100
+6 5 210
+6 7 2
+6 8 21
+7 3 120
+7 5 110
+8 9 2
\ No newline at end of file
diff --git a/gunrock/CMakeLists.txt b/gunrock/CMakeLists.txt
index 625225964..0f1d8d6ef 100644
--- a/gunrock/CMakeLists.txt
+++ b/gunrock/CMakeLists.txt
@@ -26,6 +26,7 @@ set(CUFILES
   app/cc/cc_app.cu
   app/sssp/sssp_app.cu
   app/pr/pr_app.cu
+  app/mst/mst_app.cu
   util/test_utils.cu
   util/error_utils.cu
   ${mgpu_SOURCE_FILES})
diff --git a/gunrock/app/bc/bc_app.cu b/gunrock/app/bc/bc_app.cu
index fb70e9d11..353d107ba 100644
--- a/gunrock/app/bc/bc_app.cu
+++ b/gunrock/app/bc/bc_app.cu
@@ -8,16 +8,15 @@
 /**
  * @file bc_app.cu
  *
- * @brief Gunrock Betweeness Centrality Implementation
+ * @brief Gunrock betweenness centrality (BC) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// BC includes
+// betweenness centrality includes
 #include <gunrock/app/bc/bc_enactor.cuh>
 #include <gunrock/app/bc/bc_problem.cuh>
 #include <gunrock/app/bc/bc_functor.cuh>
@@ -36,150 +35,117 @@ using namespace gunrock::app::bc;
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to the output CSR graph object
- * @param[in] graph Reference to the CSR graph object defined in main driver
+ * @param[out] graph_o Pointer to the output GRGraph object
+ * @param[in] csr Reference to the CSR graph object defined in main driver
  * @param[in] source
  * @param[in] max_grid_size
  * @param[in] num_gpus
  * @param[in] max_queue_sizing
  * @param[in] context Reference to CudaContext used by moderngpu functions
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
+template<typename VertexId, typename Value, typename SizeT>
 void run_bc(
-    GunrockGraph *ggraph_out,
-    const Csr<VertexId, Value, SizeT> &graph,
-    VertexId source,
-    int      max_grid_size,
-    int      num_gpus,
-    double   max_queue_sizing,
-    CudaContext& context) {
-    typedef BCProblem <
-        VertexId,
-        SizeT,
-        Value,
-        true, // MARK_PREDECESSORS
-        false > Problem; //does not use double buffer
-
+    GRGraph*       graph_o,
+    const Csr<VertexId, Value, SizeT>& csr,
+    const VertexId source,
+    const int      max_grid_size,
+    const int      num_gpus,
+    const double   max_queue_sizing,
+    CudaContext&   context) {
+    typedef BCProblem<VertexId, SizeT, Value, true, false > Problem;
     // Allocate host-side array (for both reference and gpu-computed results)
-    Value *h_sigmas     = (Value*)malloc(sizeof(Value) * graph.nodes);
-    Value *h_bc_values  = (Value*)malloc(sizeof(Value) * graph.nodes);
-    Value *h_ebc_values = (Value*)malloc(sizeof(Value) * graph.edges);
-
-    // Allocate BC enactor map
-    BCEnactor<false> bc_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus),
+    Value *h_sigmas     = (Value*)malloc(sizeof(Value) * csr.nodes);
+    Value *h_bc_values  = (Value*)malloc(sizeof(Value) * csr.nodes);
+    Value *h_ebc_values = (Value*)malloc(sizeof(Value) * csr.edges);
+    BCEnactor<false> enactor(false);  // Allocate BC enactor map
+    Problem *problem = new Problem;   // Allocate problem on GPU
+
+    util::GRError(problem->Init(false, csr, num_gpus),
                   "BC Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Perform BC
-    GpuTimer gpu_timer;
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
 
     VertexId start_source;
     VertexId end_source;
     if (source == -1) {
         start_source = 0;
-        end_source = graph.nodes;
+        end_source = csr.nodes;
     } else {
         start_source = source;
         end_source = source + 1;
     }
 
-    gpu_timer.Start();
     for (VertexId i = start_source; i < end_source; ++i) {
-        util::GRError(csr_problem->Reset(
-                          i, bc_enactor.GetFrontierType(), max_queue_sizing),
+        util::GRError(problem->Reset(
+                          i, enactor.GetFrontierType(), max_queue_sizing),
                       "BC Problem Data Reset Failed", __FILE__, __LINE__);
-        util::GRError(bc_enactor.template Enact<Problem>(
-                          context, csr_problem, i, max_grid_size),
+        util::GRError(enactor.template Enact<Problem>(
+                          context, problem, i, max_grid_size),
                       "BC Problem Enact Failed", __FILE__, __LINE__);
     }
 
     util::MemsetScaleKernel <<< 128, 128>>>(
-        csr_problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)graph.nodes);
-
-    gpu_timer.Stop();
+        problem->data_slices[0]->d_bc_values, (Value)0.5f, (int)csr.nodes);
 
-    float elapsed = gpu_timer.ElapsedMillis();
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
 
-    //double avg_duty = 0.0;
-    //bc_enactor.GetStatistics(avg_duty);
-
-    // Copy out results to Host Device
-    util::GRError(csr_problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
+    util::GRError(problem->Extract(h_sigmas, h_bc_values, h_ebc_values),
                   "BC Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // copy h_bc_values per node to GunrockGraph output
-    ggraph_out->node_values = (float*)&h_bc_values[0];
-    // copy h_ebc_values per edge to GunrockGraph output
-    ggraph_out->edge_values = (float*)&h_ebc_values[0];
-
-    printf("GPU Betweeness Centrality finished in %lf msec.\n", elapsed);
-
-    // Cleanup
-    if (csr_problem) delete csr_problem;
-    //if (h_sigmas) free(h_sigmas);
-    //if (h_bc_values) free(h_bc_values);
+    graph_o->node_values = (float*)&h_bc_values[0];   // h_bc_values per node
+    graph_o->edge_values = (float*)&h_ebc_values[0];  // h_ebc_values per edge
 
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  bc_config  bc specific configurations
- * @param[in]  data_type  bc data_type configurations
- * @param[in]  context    moderngpu context
+ * @param[out] graph_o  GRGraph type output
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   Specific configurations
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  ModernGPU context
  */
 void dispatch_bc(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bc_config,
-    GunrockDataType    data_type,
-    CudaContext&       context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // not support yet
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // not support yet
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // build input csr format graph
                 Csr<int, float, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
                 // bc configurations
-                int   src_node         =  -1; //!< Use whatever the specified graph-type's default is
-                int   max_grid_size    =   0; //!< maximum grid size (0: leave it up to the enactor)
-                int   num_gpus         =   1; //!< Number of GPUs for multi-gpu enactor to use
-                float max_queue_sizing = 1.0; //!< Maximum size scaling factor for work queues
+                int   src_node         =  -1;  // default source vertex to start
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                int   num_gpus         =   1;  // Number of GPUs for multi-gpu
+                float max_queue_sizing = 1.0;  // Maximum size scaling factor
 
                 // determine source vertex to start bc
-                switch (bc_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -190,7 +156,7 @@ void dispatch_bc(
                     break;
                 }
                 case manually: {
-                    src_node = bc_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -198,11 +164,11 @@ void dispatch_bc(
                     break;
                 }
                 }
-                max_queue_sizing = bc_config.queue_size;
+                max_queue_sizing = config.queue_size;
 
                 // lunch bc function
                 run_bc<int, float, int>(
-                    ggraph_out,
+                    graph_o,
                     csr_graph,
                     src_node,
                     max_grid_size,
@@ -227,29 +193,68 @@ void dispatch_bc(
 /*
  * @brief gunrock_bc function
  *
- * @param[out] ggraph_out output of bc problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  bc_config  gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[out] graph_o output of bc problem
+ * @param[in]  graph_i input graph to process
+ * @param[in]  config  gunrock primitive specific configurations
+ * @param[in]  data_t  gunrock data type configuration struct (GRTypes)
  */
-void gunrock_bc_func(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bc_config,
-    GunrockDataType    data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = bc_config.device;
+void gunrock_bc(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_bc(graph_o, graph_i, config, data_t, *context);
+}
+
+/*
+ * @brief Simple interface that takes CSR arrays as input
+ * @param[out] bc_scores   Returned BC node centrality score per node
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  source      Source vertex to begin traversal from
+ */
+void bc(
+    float*     bc_scores,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;            // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;    // integer
+    data_t.SIZET_TYPE = SIZET_INT;    // integer
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float BC scores
+
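+    struct GRSetup config;          // primitive-specific configuration. NOTE(review): src_mode is not assigned here although dispatch_bc switches on it; confirm it is initialized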
+    config.device      =      0;    // setting device to run
+    config.src_node    = source;    // source vertex to begin
+    config.queue_size  =   1.0f;    // maximum queue size factor
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_bc(graph_o, graph_i, config, data_t);
+    memcpy(bc_scores, (float*)graph_o->node_values, num_nodes * sizeof(float));
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
 
-    // lunch dispatch function
-    dispatch_bc(
-        ggraph_out,
-        ggraph_in,
-        bc_config,
-        data_type,
-        *context);
+    printf("------------------- completed -------------------\n");
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/bc/bc_enactor.cuh b/gunrock/app/bc/bc_enactor.cuh
index 59b1cf136..65f5f7e0c 100644
--- a/gunrock/app/bc/bc_enactor.cuh
+++ b/gunrock/app/bc/bc_enactor.cuh
@@ -988,7 +988,7 @@ public:
         Problem,                            // Problem data type
         300,                                // CUDA_ARCH
         INSTRUMENT,                         // INSTRUMENT
-        8,                                  // MIN_CTA_OCCUPANCY
+        1,                                  // MIN_CTA_OCCUPANCY
         10,                                 // LOG_THREADS
         8,                                  // LOG_BLOCKS
         32*128,                             // LIGHT_EDGE_THRESHOLD (used for partitioned advance mode)
diff --git a/gunrock/app/bc/bc_functor.cuh b/gunrock/app/bc/bc_functor.cuh
index 00c9d2f1e..3ec577ac3 100644
--- a/gunrock/app/bc/bc_functor.cuh
+++ b/gunrock/app/bc/bc_functor.cuh
@@ -161,7 +161,6 @@ struct BackwardFunctor
      */
     static __device__ __forceinline__ bool CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem, VertexId e_id = 0, VertexId e_id_in = 0)
     {
-
         VertexId s_label;
         VertexId d_label;
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
@@ -304,7 +303,7 @@ struct BackwardFunctor2
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
             to_delta, problem->deltas + d_id);
 
-        //Value result = from_sigma / to_sigma * (1.0 + to_delta);
+        Value result = from_sigma / to_sigma * (1.0 + to_delta);
 
         //Accumulate delta value
 
diff --git a/gunrock/app/bfs/bfs_app.cu b/gunrock/app/bfs/bfs_app.cu
index 026c7d6fc..b6313fffa 100644
--- a/gunrock/app/bfs/bfs_app.cu
+++ b/gunrock/app/bfs/bfs_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file bfs_app.cu
  *
- * @brief Gunrock Breadth-First Search implementation
+ * @brief Gunrock breadth-first search (BFS) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// BFS includes
+// breadth-first search includes
 #include <gunrock/app/bfs/bfs_enactor.cuh>
 #include <gunrock/app/bfs/bfs_problem.cuh>
 #include <gunrock/app/bfs/bfs_functor.cuh>
 
-// MGPU include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -39,8 +37,8 @@ using namespace gunrock::app::bfs;
  * @tparam MARK_PREDECESSORS
  * @tparam ENABLE_IDEMPOTENCE
  *
- * @param[out] ggraph_out Pointer to the output CSR graph
- * @param[in] ggraph_in Reference to the CSR graph we process on
+ * @param[out] graph_o Pointer to the output GRGraph object
+ * @param[in] csr Reference to the CSR graph we process on
  * @param[in] src Source node where BFS starts
  * @param[in] max_grid_size Maximum CTA occupancy
  * @param[in] num_gpus Number of GPUs
@@ -48,115 +46,89 @@ using namespace gunrock::app::bfs;
  * @param[in] context Reference to CudaContext used by moderngpu functions
  *
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT,
-    bool MARK_PREDECESSORS,
-    bool ENABLE_IDEMPOTENCE >
+template<typename VertexId, typename Value, typename SizeT,
+         bool MARK_PREDECESSORS, bool ENABLE_IDEMPOTENCE>
 void run_bfs(
-    GunrockGraph *ggraph_out,
-    const  Csr<VertexId, Value, SizeT> &ggraph_in,
-    const  VertexId src,
-    int    max_grid_size,
-    int    num_gpus,
-    double max_queue_sizing,
-    CudaContext& context) {
-    // Preparations
-    typedef BFSProblem <
-        VertexId,
-        SizeT,
-        Value,
-        MARK_PREDECESSORS,
-        ENABLE_IDEMPOTENCE,
-        (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE) > Problem;
-
-    // Allocate host-side label array for gpu-computed results
-    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes);
+    GRGraph*       graph_o,
+    const Csr<VertexId, Value, SizeT>& csr,
+    const VertexId src,
+    const int      num_gpus,
+    const double   max_queue_sizing,
+    CudaContext&   context) {
+    typedef BFSProblem<VertexId, SizeT, Value, MARK_PREDECESSORS,
+        ENABLE_IDEMPOTENCE, (MARK_PREDECESSORS && ENABLE_IDEMPOTENCE)> Problem;
+    // Allocate host-side label array for GPU-computed results
+    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     VertexId *h_preds = NULL;
     if (MARK_PREDECESSORS) {
-        //h_preds = (VertexId*)malloc(sizeof(VertexId) * ggraph_in.nodes);
+        // h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     }
 
-    // Allocate BFS enactor map
-    BFSEnactor<false> bfs_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      ggraph_in,
-                      num_gpus),
-                  "Problem BFS Initialization Failed", __FILE__, __LINE__);
+    BFSEnactor<false> enactor(false);  // Allocate BFS enactor map
+    Problem *problem = new Problem;    // Allocate problem on GPU
 
-    // Perform BFS
-    GpuTimer gpu_timer;
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "BFS Problem Initialization Failed", __FILE__, __LINE__);
 
-    util::GRError(csr_problem->Reset(
-                      src, bfs_enactor.GetFrontierType(), max_queue_sizing),
+    util::GRError(problem->Reset(
+                      src, enactor.GetFrontierType(), max_queue_sizing),
                   "BFS Problem Data Reset Failed", __FILE__, __LINE__);
 
-    gpu_timer.Start();
-    util::GRError(bfs_enactor.template Enact<Problem>(
-                      context, csr_problem, src, max_grid_size),
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
+
+    util::GRError(enactor.template Enact<Problem>(context, problem, src),
                   "BFS Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
 
-    float elapsed = gpu_timer.ElapsedMillis();
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
 
-    // Copy out results back to Host
-    util::GRError(csr_problem->Extract(h_labels, h_preds),
+    util::GRError(problem->Extract(h_labels, h_preds),
                   "BFS Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // label per node to GunrockGraph struct
-    ggraph_out->node_values = (int*)&h_labels[0];
-
-    // Clean up
-    if (csr_problem) delete csr_problem;
-    //if (h_preds)     free(h_preds);
+    graph_o->node_values = (int*)&h_labels[0];  // label per node to graph_o
 
+    if (problem) { delete problem; }
+    if (h_preds) {  free(h_preds); }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  bfs_config bfs specific configurations
- * @param[in]  data_type  bfs data_type configurations
- * @param[in]  context    moderngpu context
+ * @param[out] graph_o  GRGraph type output
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   Specific configurations
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  ModernGPU context
  */
 void dispatch_bfs(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bfs_config,
-    GunrockDataType    data_type,
-    CudaContext&       context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph*       graph_o,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext&   context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // build input csr format graph
                 Csr<int, int, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
                 // default configurations
-                int   src_node      = 0;       //!< default source vertex to start
-                int   num_gpus      = 1;       //!< number of GPUs for multi-gpu enactor to use
-                int   max_grid_size = 0;       //!< maximum grid size (0: leave it up to the enactor)
-                bool  mark_pred     = false;   //!< whether to mark predecessor or not
-                bool  idempotence   = false;   //!< whether or not to enable idempotence
-                float max_queue_sizing = 1.0f; //!< maximum size scaling factor for work queues
+                int   src_node      = 0;  // default source vertex to start
+                int   num_gpus      = 1;  // number of GPUs for multi-GPU
+                bool  mark_pred     = 0;  // whether to mark predecessor or not
+                bool  idempotence   = 0;  // whether or not to enable idempotence
+                float max_queue_sizing = 1.0f;  // maximum size scaling factor
 
-                // determine source vertex to start bfs
-                switch (bfs_config.src_mode) {
+                // determine source vertex to start
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -167,7 +139,7 @@ void dispatch_bfs(
                     break;
                 }
                 case manually: {
-                    src_node = bfs_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -175,26 +147,24 @@ void dispatch_bfs(
                     break;
                 }
                 }
-                mark_pred        = bfs_config.mark_pred;
-                idempotence      = bfs_config.idempotence;
-                max_queue_sizing = bfs_config.queue_size;
+                mark_pred        = config.mark_pred;
+                idempotence      = config.idempotence;
+                max_queue_sizing = config.queue_size;
 
                 if (mark_pred) {
                     if (idempotence) {
                         run_bfs<int, int, int, true, true>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
                     } else {
                         run_bfs<int, int, int, true, false>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
@@ -202,19 +172,17 @@ void dispatch_bfs(
                 } else {
                     if (idempotence) {
                         run_bfs<int, int, int, false, true>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
                     } else {
                         run_bfs<int, int, int, false, false>(
-                            ggraph_out,
+                            graph_o,
                             csr_graph,
                             src_node,
-                            max_grid_size,
                             num_gpus,
                             max_queue_sizing,
                             context);
@@ -225,14 +193,12 @@ void dispatch_bfs(
                 csr_graph.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // not yet support
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // not yet support
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
@@ -249,24 +215,70 @@ void dispatch_bfs(
 /*
  * @brief gunrock_bfs function
  *
- * @param[out] ggraph_out output subgraph of bfs problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  bfs_config gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[out] graph_o output subgraph of the problem
+ * @param[in]  graph_i input graph need to process on
+ * @param[in]  config  gunrock primitive specific configurations
+ * @param[in]  data_t  gunrock data_t struct
  */
-void gunrock_bfs_func(
-    GunrockGraph       *ggraph_out,
-    const GunrockGraph *ggraph_in,
-    GunrockConfig      bfs_config,
-    GunrockDataType    data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = bfs_config.device;
+void gunrock_bfs(
+    GRGraph*       graph_o,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    // the moderngpu context is created on the device chosen in config
+    const unsigned int device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_bfs(graph_o, graph_i, config, data_t, *context);
+}
+
+/*
+ * @brief Simple BFS interface: takes CSR arrays and one source vertex
+ * @param[out] bfs_label   BFS label per node (num_nodes ints are written)
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  source      Vertex the traversal starts from
+ */
+void bfs(
+    int*       bfs_label,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;          // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;  // integer
+    data_t.SIZET_TYPE = SIZET_INT;  // integer
+    data_t.VALUE_TYPE = VALUE_INT;  // integer
+
+    struct GRSetup config;          // primitive-specific configures
+    config.device      = 0;         // setting device to run
+    config.src_mode    = manually;  // dispatch_bfs reads src_node only then
+    config.src_node    = source;    // source vertex to begin
+    config.mark_pred   = false;     // do not mark predecessors
+    config.idempotence = false;     // whether enable idempotent
+    config.queue_size  = 1.0f;      // maximum queue size factor
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_bfs(graph_o, graph_i, config, data_t);
+    memcpy(bfs_label, (int*)graph_o->node_values, num_nodes * sizeof(int));
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
 
-    // launch dispatch function
-    dispatch_bfs(ggraph_out, ggraph_in, bfs_config, data_type, *context);
+    printf("------------------- completed -------------------\n");
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/cc/cc_app.cu b/gunrock/app/cc/cc_app.cu
index 97723087c..b4ac393b6 100644
--- a/gunrock/app/cc/cc_app.cu
+++ b/gunrock/app/cc/cc_app.cu
@@ -8,20 +8,15 @@
 /**
  * @file cc_app.cu
  *
- * @brief connected component implementation.
+ * @brief connected component (CC) application
  */
 
-#include <stdio.h>
-#include <string>
-#include <deque>
-#include <vector>
-#include <iostream>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// CC includes
+// connected component includes
 #include <gunrock/app/cc/cc_enactor.cuh>
 #include <gunrock/app/cc/cc_problem.cuh>
 #include <gunrock/app/cc/cc_functor.cuh>
@@ -38,112 +33,88 @@ using namespace gunrock::app::cc;
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to output CSR graph
+ * @param[out] graph_o Pointer to output CSR graph
  * @param[in] csr_graph Reference to the CSR graph we process on
  * @param[in] max_grid_size Maximum CTA occupancy for CC kernels
  * @param[in] num_gpus Number of GPUs
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
+template<typename VertexId, typename Value, typename SizeT>
 void run_cc(
-    GunrockGraph *ggraph_out,
-    unsigned int *components,
-    const Csr<VertexId, Value, SizeT> &csr_graph,
+    GRGraph*      graph_o,
+    unsigned int* components,
+    const Csr<VertexId, Value, SizeT>& csr,
     const int    max_grid_size,
     const int    num_gpus) {
-
-    // Define CCProblem
-    typedef CCProblem <
-        VertexId,
-        SizeT,
-        Value,
-        true > Problem; //use double buffer
+    typedef CCProblem<VertexId, SizeT, Value, true> Problem; // double buffer
 
     // Allocate host-side label array for gpu-computed results
     VertexId *h_component_ids
-        = (VertexId*)malloc(sizeof(VertexId) * csr_graph.nodes);
-
-    // Allocate CC enactor map
-    CCEnactor<false> cc_enactor(false);
+        = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
+    CCEnactor<false> cc_enactor(false);  // CC enactor used for Enact() below
+    Problem *problem = new Problem;  // problem object, set up by Init() below
 
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      csr_graph,
-                      num_gpus),
+    util::GRError(problem->Init(false, csr, num_gpus),
                   "CC Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Reset CC Problem Data
-    util::GRError(csr_problem->Reset(
+    util::GRError(problem->Reset(
                       cc_enactor.GetFrontierType()),
                   "CC Problem Data Reset Failed", __FILE__, __LINE__);
 
-    // Perform Connected Component
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    // Lunch CC Enactor
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // time Enact
+
     util::GRError(cc_enactor.template Enact<Problem>(
-                      csr_problem, max_grid_size),
+                      problem, max_grid_size),
                   "CC Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    float elapsed = gpu_timer.ElapsedMillis();
 
-    // Copy out results back to Host Device
-    util::GRError(csr_problem->Extract(h_component_ids),
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed, in ms
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
+    util::GRError(problem->Extract(h_component_ids),
                   "CC Problem Data Extraction Failed", __FILE__, __LINE__);
 
     // Compute number of components in graph
-    unsigned int temp = csr_problem->num_components;
+    unsigned int temp = problem->num_components;
     *components = temp;
 
-    // copy component_id per node to GunrockGraph struct
-    ggraph_out->node_values = (int*)&h_component_ids[0];
-
-    printf("GPU Connected Component finished in %lf msec.\n", elapsed);
-
-    // Cleanup
-    if (csr_problem)  delete csr_problem;
+    // expose the per-node component ids via graph_o; buffer is not freed here
+    graph_o->node_values = (int*)&h_component_ids[0];
 
+    if (problem)  delete problem;  // cleanup of the problem object only
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out GunrockGraph type output
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  cc_config  cc specific configurations
- * @param[in]  data_type  data type configurations
+ * @param[out] graph_o GRGraph type output
+ * @param[in]  graph_i GRGraph type input graph
+ * @param[in]  config  cc specific configurations
+ * @param[in]  data_t  data type configurations
  */
 void dispatch_cc(
-    GunrockGraph          *ggraph_out,
-    unsigned int          *components,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   cc_config,
-    const GunrockDataType data_type) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph*       graph_o,
+    unsigned int*  components,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 // build input csr format graph
                 Csr<int, int, int> csr_graph(false);
-                csr_graph.nodes = ggraph_in->num_nodes;
-                csr_graph.edges = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
+                csr_graph.nodes = graph_i->num_nodes;
+                csr_graph.edges = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
 
-                int max_grid_size = 0; //!< 0: leave it up to the enactor
-                int num_gpus      = 1; //!< number of GPUs
+                int max_grid_size = 0;  // 0: leave it up to the enactor
+                int num_gpus      = 1;  // number of GPUs
 
-                // lunch cc dispatch function
                 run_cc<int, int, int>(
-                    ggraph_out,
+                    graph_o,
                     (unsigned int*)components,
                     csr_graph,
                     max_grid_size,
@@ -154,13 +125,11 @@ void dispatch_cc(
                 csr_graph.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
@@ -176,20 +145,65 @@ void dispatch_cc(
 /*
  * @brief gunrock_cc function
  *
- * @param[out] ggraph_out output subgraph of cc problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  cc_configs primitive specific configurations
- * @param[in]  data_type  gunrock data_type struct
+ * @param[out] graph_o    output graph, filled in through dispatch_cc
+ * @param[out] components forwarded to dispatch_cc (number of components)
+ * @param[in]  graph_i    input graph need to process on
+ * @param[in]  config, data_t  primitive configurations and data type struct
+ */
+void gunrock_cc(
+    GRGraph       *graph_o,
+    unsigned int  *components,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    dispatch_cc(graph_o, components, graph_i, config, data_t);  // pure forward
+}
+
+/*
+ * @brief Simple CC interface: takes CSR arrays, returns the component count
+ * @param[out] components  Component ID per node (num_nodes ints are written)
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @return Number of components calculated
  */
-void gunrock_cc_func(
-    GunrockGraph          *ggraph_out,
-    unsigned int          *components,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   cc_configs,
-    const GunrockDataType data_type) {
-
-    // lunch dispatch function
-    dispatch_cc(ggraph_out, components, ggraph_in, cc_configs, data_type);
+int cc(
+    int*       components,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;          // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;  // integer
+    data_t.SIZET_TYPE = SIZET_INT;  // integer
+    data_t.VALUE_TYPE = VALUE_INT;  // integer
+
+    struct GRSetup config;  // primitive-specific configures
+    config.device = 0;      // setting device to run
+
+    unsigned int num_components = 0;
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_cc(graph_o, &num_components, graph_i, config, data_t);
+    memcpy(components, (int*)graph_o->node_values, num_nodes * sizeof(int));
+
+    // run_cc malloc'ed the results buffer; it was copied out above, so release
+    free(graph_o->node_values);
+    free(graph_o);
+    free(graph_i);
+
+    printf("------------------- completed -------------------\n");
+    return num_components;
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/mst/mst_app.cu b/gunrock/app/mst/mst_app.cu
new file mode 100644
index 000000000..55e350471
--- /dev/null
+++ b/gunrock/app/mst/mst_app.cu
@@ -0,0 +1,177 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- Fast and Efficient GPU Graph Library
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file mst_app.cu
+ *
+ * @brief minimum spanning tree (MST) application
+ */
+
+#include <gunrock/gunrock.h>
+
+// graph construction utilities
+#include <gunrock/graphio/market.cuh>
+
+// primitive-specific includes
+#include <gunrock/app/mst/mst_enactor.cuh>
+#include <gunrock/app/mst/mst_problem.cuh>
+#include <gunrock/app/mst/mst_functor.cuh>
+
+#include <moderngpu.cuh>
+
+using namespace gunrock;
+using namespace gunrock::util;
+using namespace gunrock::oprtr;
+using namespace gunrock::app::mst;
+
+/**
+ * @brief Run MST on a CSR graph and hand the per-edge result to graph_o
+ *
+ * @tparam VertexId  Also the element type of the extracted result array
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[out] graph_o   edge_values receives a malloc'ed array of csr.edges
+ * @param[in]  csr       Reference to the CSR graph we process on
+ * @param[in]  max_grid  Maximum CTA occupancy
+ * @param[in]  num_gpus  Number of GPUs
+ * @param[in]  context   Modern GPU context
+ */
+template<typename VertexId, typename Value, typename SizeT>
+void run_mst(
+    GRGraph *graph_o,
+    const Csr<VertexId, Value, SizeT> &csr,
+    const int    max_grid,
+    const int    num_gpus,
+    CudaContext  &context) {
+    typedef MSTProblem<VertexId, SizeT, Value, true> Problem;  // preparations
+    MSTEnactor<false> enactor(false);                          // enactor map
+    // malloc (as run_cc does) rather than new[]: the buffer leaves through
+    // graph_o, and a caller of the C-style interface can only free() it
+    VertexId *h_mst  = (VertexId*)malloc(sizeof(VertexId) * csr.edges);
+    Problem *problem = new Problem;                            // problem object
+
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "MST Data Initialization Failed", __FILE__, __LINE__);
+    util::GRError(problem->Reset(enactor.GetFrontierType()),
+                  "MST Data Reset Failed", __FILE__, __LINE__);
+
+    util::GRError(enactor.template Enact<Problem>(context, problem, max_grid),
+                  "MST Enact Failed", __FILE__, __LINE__);
+
+    util::GRError(problem->Extract(h_mst),
+                  "MST Data Extraction Failed", __FILE__, __LINE__);
+
+    graph_o->edge_values = (int*)&h_mst[0];  // output: 0|1 mask for all edges
+
+    if (problem) { delete problem; }
+    cudaDeviceSynchronize();
+}
+
+/**
+ * @brief Build a CSR view of the input graph and run MST for one Value type
+ *
+ * The CSR object only points at the arrays held by graph_i; the pointers are
+ * set back to NULL after the run, before the CSR object goes out of scope.
+ *
+ * @tparam Value  Type the edge_values array of graph_i is read as
+ *
+ * @param[out] graph_o  GRGraph type output graph
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  context  Modern GPU context parameter
+ */
+template<typename Value>
+static void dispatch_mst_typed(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    CudaContext   &context) {
+    const int num_gpus = 1;  // number of GPU(s) to use
+    const int max_grid = 0;  // leave it up to the enactor
+
+    // create a CSR formatted graph
+    Csr<int, Value, int> csr(false);
+    csr.nodes          = graph_i->num_nodes;
+    csr.edges          = graph_i->num_edges;
+    csr.row_offsets    = (int*)graph_i->row_offsets;
+    csr.column_indices = (int*)graph_i->col_indices;
+    csr.edge_values    = (Value*)graph_i->edge_values;
+
+    run_mst<int, Value, int>(graph_o, csr, max_grid, num_gpus, context);
+
+    // reset for free memory
+    csr.row_offsets    = NULL;
+    csr.column_indices = NULL;
+    csr.edge_values    = NULL;
+}
+
+/**
+ * @brief dispatch function to handle data types
+ *
+ * MST runs for <int, int, int> and <int, float, int>; <int, uint, int> only
+ * prints a notice, and any other combination returns without doing anything.
+ *
+ * @param[out] graph_o  GRGraph type output graph
+ * @param[in]  graph_i  GRGraph type input graph
+ * @param[in]  config   MST-specific configurations (not read here)
+ * @param[in]  data_t   Data type configurations
+ * @param[in]  context  Modern GPU context parameter
+ */
+void dispatch_mst(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    // only int vertex IDs and int sizes are handled
+    if (data_t.VTXID_TYPE != VTXID_INT) return;
+    if (data_t.SIZET_TYPE != SIZET_INT) return;
+
+    // the value type selects the template instantiation
+    switch (data_t.VALUE_TYPE) {
+    case VALUE_INT: {  // template type = <int, int, int>
+        dispatch_mst_typed<int>(graph_o, graph_i, context);
+        break;
+    }
+    case VALUE_UINT: {  // template type = <int, unsigned int, int>
+        printf("Not Yet Support This DataType Combination.\n");
+        break;
+    }
+    case VALUE_FLOAT: {  // template type = <int, float, int>
+        dispatch_mst_typed<float>(graph_o, graph_i, context);
+        break;
+    }
+    }
+}
+
+/**
+ * @brief MST entry point taking GRGraph input/output structures
+ *
+ * Creates a moderngpu context on config.device and hands all arguments,
+ * together with that context, to dispatch_mst. The function itself is not
+ * a template; data_t is what selects the types further down.
+ *
+ * @param[out] graph_o GRGraph type output graph
+ * @param[in]  graph_i GRGraph type input graph
+ * @param[in]  config  Primitive-specific configurations
+ * @param[in]  data_t  Data type configurations
+ */
+void gunrock_mst(
+    GRGraph       *graph_o,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    // the device index comes straight from the caller's configuration
+    const unsigned int device = config.device;
+    ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_mst(graph_o, graph_i, config, data_t, *context);
+}
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/app/mst/mst_enactor.cuh b/gunrock/app/mst/mst_enactor.cuh
index 7d7d7a0a1..fa8dac343 100644
--- a/gunrock/app/mst/mst_enactor.cuh
+++ b/gunrock/app/mst/mst_enactor.cuh
@@ -215,10 +215,8 @@ public:
     typedef SuRmFunctor <VertexId, SizeT, VertexId, MSTProblem> SuRmFunctor;
     typedef EIdxFunctor <VertexId, SizeT, VertexId, MSTProblem> EIdxFunctor;
     typedef MarkFunctor <VertexId, SizeT, VertexId, MSTProblem> MarkFunctor;
-    //typedef OrFunctor   <VertexId, SizeT, VertexId, MSTProblem> OrFunctor;
 
     cudaError_t retval = cudaSuccess;
-
     unsigned int *d_scanned_edges = NULL;
 
     do
@@ -297,9 +295,9 @@ public:
           problem->data_slices[0]->d_keys_array,
           problem->data_slices[0]->d_edge_weights,
           graph_slice->edges,
-          std::numeric_limits<int>::max(),
-          mgpu::minimum<int>(),
-          mgpu::equal_to<int>(),
+          std::numeric_limits<Value>::max(),
+          mgpu::minimum<Value>(),
+          mgpu::equal_to<Value>(),
           problem->data_slices[0]->d_reduced_keys,
           problem->data_slices[0]->d_reduced_vals,
           &num_segments, (int*)0, context);
@@ -341,8 +339,8 @@ public:
         util::MemsetKernel<<<128, 128>>>(problem->data_slices[0]->d_successors,
           std::numeric_limits<int>::max(), graph_slice->nodes);
         util::MemsetKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
-          std::numeric_limits<int>::max(), graph_slice->nodes);
+          problem->data_slices[0]->d_temp_index,
+          std::numeric_limits<VertexId>::max(), graph_slice->nodes);
         util::MemsetIdxKernel<<<128, 128>>>(
           graph_slice->frontier_queues.d_keys[frontier_attribute.selector],
           graph_slice->nodes);
@@ -680,41 +678,41 @@ public:
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_col_indices
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_col_indices,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_col_indices, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_edge_weights
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_value,
           problem->data_slices[0]->d_edge_weights,
           graph_slice->edges);
-        util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+        util::CUBSelect<Value, SizeT>(
+          problem->data_slices[0]->d_temp_value, graph_slice->edges,
           problem->data_slices[0]->d_edge_weights, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_keys_array
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_keys_array, num_selected);
 
         ////////////////////////////////////////////////////////////////////////
         // filter to remove all -1 in d_origin_edges
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_origin_edges,
           graph_slice->edges);
         util::CUBSelect<VertexId, SizeT>(
-          problem->data_slices[0]->d_temp_storage, graph_slice->edges,
+          problem->data_slices[0]->d_temp_index, graph_slice->edges,
           problem->data_slices[0]->d_origin_edges, num_selected);
 
         if (DEBUG) printf("  * finished remove edges in one super-vertex.\n");
@@ -785,12 +783,12 @@ public:
         ////////////////////////////////////////////////////////////////////////
         // bring edges, weights, origin_eids together according to keys
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
         util::MemsetCopyVectorKernel<<<128, 128>>>(
-          problem->data_slices[0]->d_tmp_storage,
+          problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_keys_array,
           graph_slice->edges);
 
@@ -801,276 +799,15 @@ public:
 
         util::CUBRadixSort<VertexId, Value>(
           true, graph_slice->edges,
-          problem->data_slices[0]->d_temp_storage,
+          problem->data_slices[0]->d_temp_index,
           problem->data_slices[0]->d_edge_weights);
 
         util::CUBRadixSort<VertexId, VertexId>(
           true, graph_slice->edges,
-          problem->data_slices[0]->d_tmp_storage,
+          problem->data_slices[0]->d_super_edges,  // used as temp_index
           problem->data_slices[0]->d_origin_edges);
 
         if (DEBUG) printf("  * finished sort according to new vertex ids.\n");
-
-        /*
-        ////////////////////////////////////////////////////////////////////////
-        // remove duplicated edges between super-vertices (optional operation)
-        if (false)//(enactor_stats.iteration == 0)
-        {
-          //////////////////////////////////////////////////////////////////////
-          // generate edge flag array based on source vertices list [1]
-          // using MarkSegmentFromKeys on d_keys_array
-          util::MemsetKernel<unsigned int><<<128, 128>>>(
-            problem->data_slices[0]->d_flags_array, 0, graph_slice->edges);
-          util::MarkSegmentFromKeys<<<128, 128>>>(
-            problem->data_slices[0]->d_flags_array,
-            problem->data_slices[0]->d_keys_array,
-            graph_slice->edges);
-
-          if (debug_info)
-          {
-            printf(":: mark segment to generate edge flag array [1] ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_flags_array, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // generate edge flag array based on destination vertices list [2]
-          // create a flags array on the output of segmented sort based on the
-          // difference in u-v pair using MarkSegmentsFromKeys kernel function
-          util::MarkSegmentFromKeys<<<128, 128>>>(
-            problem->data_slices[0]->d_edge_flags,
-            problem->data_slices[0]->d_col_indices,
-            graph_slice->edges);
-
-          if (debug_info)
-          {
-            printf(":: mark segment to generate edge flag array [2] ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_flags, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // do or operation for d_edge_flags and d_flags_array - u-v pair
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, OrFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          if (DEBUG) printf("  * finished edge flags - second edge removal.\n");
-
-          if (debug_info)
-          {
-            printf(":: duplicated edges between super-vertex d_edge_flags ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_flags, graph_slice->edges);
-            printf(":: edge removal u list (d_keys_array) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_keys_array, graph_slice->edges);
-            printf(":: edge removal v list (d_col_indices) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_col_indices, graph_slice->edges);
-            printf(":: edge removal w list (d_edge_weights) ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_weights, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // scan edge_flags to get edge_keys used for sorting
-          Scan<MgpuScanTypeInc>(
-            (int*)problem->data_slices[0]->d_edge_flags, graph_slice->edges,
-            (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
-            (int*)problem->data_slices[0]->d_temp_storage, context);
-
-          // set first bit of edge_flags back to 1
-          util::MemsetKernel<unsigned int><<<1, 1>>>(
-            problem->data_slices[0]->d_edge_flags, 1, 1);
-
-          //////////////////////////////////////////////////////////////////////
-          // calculate the number of segments for edge_offsets
-          num_segments = Reduce(
-            problem->data_slices[0]->d_edge_flags, graph_slice->edges, context);
-
-          //////////////////////////////////////////////////////////////////////
-          // generate edge_offsets used for SegSortFromIndices
-          // edge_flags stored in d_row_offsets
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, EIdxFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          //////////////////////////////////////////////////////////////////////
-          // segmented sort d_col_indices, d_edge_weights and d_origin_edges
-          // copy d_edge_weights to d_temp_storage to use for segmented sort
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_edge_weights,
-            graph_slice->edges);
-
-          util::SegSortFromIndices<SizeT, VertexId, Value>(
-            context,
-            num_segments,
-            problem->data_slices[0]->d_row_offsets,
-            graph_slice->edges,
-            problem->data_slices[0]->d_edge_weights,
-            problem->data_slices[0]->d_col_indices);
-
-          util::SegSortFromIndices<SizeT, VertexId, VertexId>(
-            context,
-            num_segments,
-            problem->data_slices[0]->d_row_offsets,
-            graph_slice->edges,
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_origin_edges);
-
-          if (DEBUG) printf("  * finished segmentedSort for edge reduction.\n");
-
-          if (debug_info)
-          {
-            printf(":: second reduction segmented sort d_col_indices ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_col_indices, graph_slice->edges);
-            printf(":: second reduction segmented sort d_edge_weights ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_edge_weights, graph_slice->edges);
-            printf(":: second reduction segmented sort d_origin_edges ::");
-            util::DisplayDeviceResults(
-              problem->data_slices[0]->d_origin_edges, graph_slice->edges);
-          }
-
-          //////////////////////////////////////////////////////////////////////
-          // mark -1 to edges that needed to be removed using advance kernel
-          frontier_attribute.queue_index  = 0;
-          frontier_attribute.selector     = 0;
-          frontier_attribute.queue_length = graph_slice->edges;
-          frontier_attribute.queue_reset  = true;
-
-          gunrock::oprtr::filter::Kernel
-            <FilterKernelPolicy, MSTProblem, SuRmFunctor>
-            <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
-            enactor_stats.iteration + 1,
-            frontier_attribute.queue_reset,
-            frontier_attribute.queue_index,
-            enactor_stats.num_gpus,
-            frontier_attribute.queue_length,
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector],
-            NULL,
-            graph_slice->frontier_queues.d_values[frontier_attribute.selector^1],
-            data_slice,
-            NULL,
-            work_progress,
-            graph_slice->frontier_elements[frontier_attribute.selector],
-            graph_slice->frontier_elements[frontier_attribute.selector^1],
-            enactor_stats.filter_kernel_stats);
-
-          if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
-            "filter::Kernel failed", __FILE__, __LINE__))) break;
-
-          if (DEBUG) printf("  * finished mark -1 for duplicated edges.\n");
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_col_indices
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_col_indices,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_col_indices,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_edge_weights
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_edge_weights,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_edge_weights,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_keys_array
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_keys_array,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_keys_array,
-            num_selected);
-
-          //////////////////////////////////////////////////////////////////////
-          // filter to remove all -1 in d_origin_edges
-          util::MemsetCopyVectorKernel<<<128, 128>>>(
-            problem->data_slices[0]->d_temp_storage,
-            problem->data_slices[0]->d_origin_edges,
-            graph_slice->edges);
-          util::CUBSelect<VertexId, SizeT>(
-            problem->data_slices[0]->d_temp_storage,
-            graph_slice->edges,
-            problem->data_slices[0]->d_origin_edges,
-            num_selected);
-
-          if (DEBUG)
-            printf("  * finished remove edges between super-vertices.\n");
-
-          graph_slice->edges = *num_selected;
-
-          if (DEBUG)
-            printf("  * finished update #edges: %d [2]\n", graph_slice->edges);
-
-        } // end of removing duplicated edges between super-vertices
-        */
-
         if (DEBUG) printf(" (d). Constructing the Vertex List.\n");
 
         ////////////////////////////////////////////////////////////////////////
diff --git a/gunrock/app/mst/mst_functor.cuh b/gunrock/app/mst/mst_functor.cuh
index 8cf90fe0e..7f38e5d31 100644
--- a/gunrock/app/mst/mst_functor.cuh
+++ b/gunrock/app/mst/mst_functor.cuh
@@ -111,7 +111,8 @@ struct EdgeFunctor
     VertexId s_id, VertexId d_id, DataSlice *problem,
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
-    return problem->d_successors[s_id] == d_id;
+    return problem->d_successors[s_id] == d_id &&
+      problem->d_reduced_vals[s_id] == problem->d_edge_weights[e_id];
   }
 
   /**
@@ -128,7 +129,7 @@ struct EdgeFunctor
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      problem->d_origin_edges[e_id], problem->d_temp_storage + s_id);
+      problem->d_origin_edges[e_id], problem->d_temp_index + s_id);
   }
 };
 
@@ -184,7 +185,7 @@ struct MarkFunctor
   {
     // mark minimum spanning tree output edges
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      1, problem->d_mst_output + problem->d_temp_storage[s_id]);
+      1, problem->d_mst_output + problem->d_temp_index[s_id]);
   }
 };
 
@@ -246,7 +247,7 @@ struct CyRmFunctor
 
     // remove some edges in the MST output result
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      0, problem->d_mst_output + problem->d_temp_storage[s_id]);
+      0, problem->d_mst_output + problem->d_temp_index[s_id]);
   }
 };
 
@@ -363,13 +364,14 @@ struct EgRmFunctor
     VertexId e_id = 0, VertexId e_id_in = 0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_keys_array + e_id);
+      (VertexId)-1, problem->d_keys_array + e_id);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_col_indices + e_id);
+      (VertexId)-1, problem->d_col_indices + e_id);
+    //util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
+    //  (Value)-1, problem->d_edge_weights + e_id);
+    problem->d_edge_weights[e_id] = (Value) -1;
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_edge_weights + e_id);
-    util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_origin_edges + e_id);
+      (VertexId)-1, problem->d_origin_edges + e_id);
   }
 
   /**
@@ -505,7 +507,7 @@ struct EIdxFunctor
     VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      node, problem->d_row_offsets + problem->d_temp_storage[node]);
+      node, problem->d_row_offsets + problem->d_temp_index[node]);
   }
 };
 
@@ -606,13 +608,13 @@ struct SuRmFunctor
     VertexId node, DataSlice *problem, Value v = 0, SizeT nid=0)
   {
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_keys_array + node);
+      (VertexId)-1, problem->d_keys_array + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_col_indices + node);
+      (VertexId)-1, problem->d_col_indices + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_edge_weights + node);
+      (Value)   -1, problem->d_edge_weights + node);
     util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
-      -1, problem->d_origin_edges + node);
+      (VertexId)-1, problem->d_origin_edges + node);
   }
 };
 
diff --git a/gunrock/app/mst/mst_problem.cuh b/gunrock/app/mst/mst_problem.cuh
index b2b8e7f1f..9f4e3db6b 100644
--- a/gunrock/app/mst/mst_problem.cuh
+++ b/gunrock/app/mst/mst_problem.cuh
@@ -47,7 +47,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
   typedef _SizeT    SizeT;
   typedef _Value    Value;
 
-  static const bool MARK_PREDECESSORS  = true;
+  static const bool MARK_PREDECESSORS  =  true;
   static const bool ENABLE_IDEMPOTENCE = false;
 
   // helper structures
@@ -71,10 +71,10 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
     VertexId     *d_origin_edges; // origin edge list keep track of e_ids
     VertexId     *d_super_edges;  // super edge list for next iteration
     VertexId     *d_col_indices;  // column indices of CSR graph (edges)
+    VertexId     *d_temp_index;   // used for storing temp index
+    Value        *d_temp_value;   // used for storing temp value
     Value        *d_reduced_vals; // store reduced minimum weights
     Value        *d_edge_weights; // store weights per edge
-    Value        *d_temp_storage; // used for storing temp arrays
-    Value        *d_tmp_storage;  // used for storing temp arrays
     SizeT        *d_supervtx_ids; // super vertex ids scanned from flags
     SizeT        *d_row_offsets;  // row offsets of CSR graph
   };
@@ -107,10 +107,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
    * @brief MSTProblem default constructor
    */
 
-  MSTProblem():
-  nodes(0),
-  edges(0),
-  num_gpus(0) {}
+  MSTProblem(): nodes(0), edges(0), num_gpus(0) {}
 
   /**
    * @brief MSTProblem constructor
@@ -153,9 +150,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
       if (data_slices[i]->d_keys_array)
         util::GRError(cudaFree(data_slices[i]->d_keys_array),
           "GpuSlice cudaFree  d_keys_array  failed", __FILE__, __LINE__);
-      if (data_slices[i]->d_temp_storage)
-        util::GRError(cudaFree(data_slices[i]->d_temp_storage),
-          "GpuSlice cudaFree d_temp_storage failed", __FILE__, __LINE__);
+      if (data_slices[i]->d_temp_index)
+        util::GRError(cudaFree(data_slices[i]->d_temp_index),
+          "GpuSlice cudaFree  d_temp_index  failed", __FILE__, __LINE__);
       if (data_slices[i]->d_reduced_keys)
         util::GRError(cudaFree(data_slices[i]->d_reduced_keys),
           "GpuSlice cudaFree d_reduced_keys failed", __FILE__, __LINE__);
@@ -183,9 +180,9 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
       if (data_slices[i]->d_edge_flags)
         util::GRError(cudaFree(data_slices[i]->d_edge_flags),
           "GpuSlice cudaFree  d_edge_flags  failed", __FILE__, __LINE__);
-      if (data_slices[i]->d_tmp_storage)
-        util::GRError(cudaFree(data_slices[i]->d_tmp_storage),
-          "GpuSlice cudaFree d_tmp_storage  failed", __FILE__, __LINE__);
+      if (data_slices[i]->d_temp_value)
+        util::GRError(cudaFree(data_slices[i]->d_temp_value),
+          "GpuSlice cudaFree  d_temp_value  failed", __FILE__, __LINE__);
       if (data_slices[i]->d_super_edges)
         util::GRError(cudaFree(data_slices[i]->d_super_edges),
           "GpuSlice cudaFree d_super_edges  failed", __FILE__, __LINE__);
@@ -348,7 +345,7 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
           __FILE__, __LINE__)) return retval;
           data_slices[0]->d_reduced_vals = d_reduced_vals;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_reduced_vals, 0, nodes);
+          data_slices[0]->d_reduced_vals, (Value)0, nodes);
 
         unsigned int *d_flags_array;
         if (retval = util::GRError(cudaMalloc(
@@ -370,15 +367,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         util::MemsetKernel<<<128, 128>>>(
           data_slices[0]->d_keys_array, 0, edges);
 
-        SizeT *d_temp_storage;
+        VertexId *d_temp_index;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_temp_storage,
-          edges * sizeof(SizeT)),
-          "MSTProblem cudaMalloc d_temp_storage Failed",
+          (void**)&d_temp_index,
+          edges * sizeof(VertexId)),
+          "MSTProblem cudaMalloc d_temp_index Failed",
           __FILE__, __LINE__)) return retval;
-          data_slices[0]->d_temp_storage = d_temp_storage;
+          data_slices[0]->d_temp_index = d_temp_index;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_temp_storage, 0, edges);
+          data_slices[0]->d_temp_index, (VertexId)0, edges);
 
         VertexId *d_reduced_keys;
         if (retval = util::GRError(cudaMalloc(
@@ -473,15 +470,15 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         util::MemsetKernel<unsigned int><<<128, 128>>>(
           data_slices[0]->d_edge_flags, 0, edges);
 
-        Value *d_tmp_storage;
+        Value *d_temp_value;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_tmp_storage,
+          (void**)&d_temp_value,
           edges * sizeof(Value)),
-          "MSTProblem cudaMalloc d_tmp_storage Failed",
+          "MSTProblem cudaMalloc d_temp_value Failed",
           __FILE__, __LINE__)) return retval;
-          data_slices[0]->d_tmp_storage = d_tmp_storage;
+          data_slices[0]->d_temp_value = d_temp_value;
         util::MemsetKernel<<<128, 128>>>(
-          data_slices[0]->d_tmp_storage, 0, edges);
+          data_slices[0]->d_temp_value, (Value)0, edges);
 
         data_slices[0]->d_labels = NULL;
       }
@@ -576,14 +573,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         data_slices[gpu]->d_keys_array = d_keys_array;
       }
 
-      if (!data_slices[gpu]->d_temp_storage)
+      if (!data_slices[gpu]->d_temp_index)
       {
-        SizeT *d_temp_storage;
+        VertexId *d_temp_index;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_temp_storage, edges * sizeof(SizeT)),
-          "MSTProblem cudaMalloc d_temp_storage Failed",
+          (void**)&d_temp_index, edges * sizeof(VertexId)),
+          "MSTProblem cudaMalloc d_temp_index Failed",
           __FILE__, __LINE__)) return retval;
-        data_slices[gpu]->d_temp_storage = d_temp_storage;
+        data_slices[gpu]->d_temp_index = d_temp_index;
       }
 
       if (!data_slices[gpu]->d_successors)
@@ -685,14 +682,14 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
         data_slices[gpu]->d_edge_flags = d_edge_flags;
       }
 
-      if (!data_slices[gpu]->d_tmp_storage)
+      if (!data_slices[gpu]->d_temp_value)
       {
-        Value *d_tmp_storage;
+        Value *d_temp_value;
         if (retval = util::GRError(cudaMalloc(
-          (void**)&d_tmp_storage, edges * sizeof(Value)),
-          "MSTProblem cudaMalloc d_tmp_storage Failed",
+          (void**)&d_temp_value, edges * sizeof(Value)),
+          "MSTProblem cudaMalloc d_temp_value Failed",
           __FILE__, __LINE__)) return retval;
-        data_slices[gpu]->d_tmp_storage = d_tmp_storage;
+        data_slices[gpu]->d_temp_value = d_temp_value;
       }
 
       data_slices[0]->d_labels = NULL;
@@ -727,4 +724,4 @@ struct MSTProblem : ProblemBase<_VertexId, _SizeT, _USE_DOUBLE_BUFFER>
 // Local Variables:
 // mode:c++
 // c-file-style: "NVIDIA"
-// End:
\ No newline at end of file
+// End:
diff --git a/gunrock/app/pr/pr_app.cu b/gunrock/app/pr/pr_app.cu
index 8a7200595..2d6d2c376 100644
--- a/gunrock/app/pr/pr_app.cu
+++ b/gunrock/app/pr/pr_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file pr_app.cu
  *
- * @brief Gunrock PageRank Implementation
+ * @brief Gunrock PageRank application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// Page Rank includes
+// page-rank includes
 #include <gunrock/app/pr/pr_enactor.cuh>
 #include <gunrock/app/pr/pr_problem.cuh>
 #include <gunrock/app/pr/pr_functor.cuh>
 
-// Moderngpu include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -31,168 +29,118 @@ using namespace gunrock::oprtr;
 using namespace gunrock::app::pr;
 
 /**
- * @brief run page rank
+ * @brief run page-rank
  *
  * @tparam VertexId
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out Pointer to output CSR graph
+ * @param[out] graph_o Pointer to output CSR graph
  * @param[out] node_ids Pointer to output node IDs
  * @param[out] page_rank Pointer to output PageRanks
- * @param[in] graph Reference to the CSR graph we process on
- * @param[in] source Source ID for personalized PageRank (-1 for general PageRank)
- * @param[in] delta Delta value for computing Page Rank, usually set to .85
+ * @param[in] csr Reference to the CSR graph we process on
+ * @note Takes no source ID; Reset() is always called with source 0
+ * @param[in] delta Delta value for computing PageRank, usually set to 0.85
  * @param[in] error Error threshold value
  * @param[in] max_iter Max iteration for Page Rank computing
  * @param[in] max_grid_size Maximum CTA occupancy
  * @param[in] num_gpus Number of GPUs
  * @param[in] context CudaContext for moderngpu to use
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT >
-void run_page_rank(
-    GunrockGraph   *ggraph_out,
+template<typename VertexId, typename Value, typename SizeT>
+ void run_pagerank(
+    GRGraph        *graph_o,
     VertexId       *node_ids,
-    Value          *page_rank,
-    const Csr<VertexId, Value, SizeT> &graph,
-    const VertexId source,
+    Value          *pagerank,
+    const Csr<VertexId, Value, SizeT> &csr,
     const Value    delta,
     const Value    error,
     const SizeT    max_iter,
     const int      max_grid_size,
     const int      num_gpus,
     CudaContext&   context) {
-    typedef PRProblem <
-        VertexId,
-        SizeT,
-        Value > Problem;
-
-    // Allocate host-side label array for gpu-computed results
-    //Value    *h_rank    = (Value*)malloc(sizeof(Value) * graph.nodes);
-    //VertexId *h_node_id = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
-
-    // Allocate Page Rank enactor map
-    PREnactor<false> pr_enactor(false);
-
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus),
-                  "PageRank Problem Initialization Failed", __FILE__, __LINE__);
-
-    // Perform PageRank
-    GpuTimer gpu_timer;
-
-    util::GRError(csr_problem->Reset(
-                      source, delta, error, pr_enactor.GetFrontierType()),
-                  "PageRank Problem Data Reset Failed", __FILE__, __LINE__);
-    gpu_timer.Start();
-    util::GRError(pr_enactor.template Enact<Problem>(
-                      context, csr_problem, max_iter, max_grid_size),
-                  "PageRank Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-
-    float elapsed = gpu_timer.ElapsedMillis();
-
-    // Copy out results
-    util::GRError(csr_problem->Extract(page_rank, node_ids),
-                  "PageRank Problem Data Extraction Failed",
-                  __FILE__, __LINE__);
-
-    // Cleanup
-    if (csr_problem) delete csr_problem;
-    //if (h_node_id)   free(h_node_id);
-    //if (h_rank)      free(h_rank);
+    typedef PRProblem<VertexId, SizeT, Value> Problem;
+    PREnactor<false> enactor(false);  // PageRank enactor map
+    Problem *problem = new Problem;   // Allocate problem on GPU
 
+    util::GRError(problem->Init(false, csr, num_gpus),
+                  "PR Problem Initialization Failed", __FILE__, __LINE__);
+
+    util::GRError(problem->Reset(0, delta, error, enactor.GetFrontierType()),
+                  "PR Problem Data Reset Failed", __FILE__, __LINE__);
+
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
+
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, max_iter, max_grid_size),
+                  "PR Problem Enact Failed", __FILE__, __LINE__);
+
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
+
+    util::GRError(problem->Extract(pagerank, node_ids),
+                  "PR Problem Extraction Failed", __FILE__, __LINE__);
+
+    if (problem) delete problem;
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out output of pr problem
+ * @param[out] graph_o    output of pr problem
  * @param[out] node_ids   output of pr problem
  * @param[out] page_rank  output of pr problem
- * @param[in]  ggraph_in  GunrockGraph type input graph
- * @param[in]  pr_config  pr specific configurations
- * @param[in]  data_type  data type configurations
+ * @param[in]  graph_i    GRGraph type input graph
+ * @param[in]  config     specific configurations
+ * @param[in]  data_t     data type configurations
  * @param[in]  context    moderngpu context
  */
-void dispatch_page_rank(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *page_rank,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   pr_config,
-    const GunrockDataType data_type,
-    CudaContext&          context) {
-    switch (data_type.VTXID_TYPE) {
+void dispatch_pagerank(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *pagerank,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext   &context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 // build input csr format graph
                 Csr<int, float, int> csr_graph(false);
-                csr_graph.nodes          = ggraph_in->num_nodes;
-                csr_graph.edges          = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
-
-                // page rank configurations
-                float delta         = 0.85f; //!< default delta value
-                float error         = 0.01f; //!< error threshold
-                int   max_iter      = 20;    //!< maximum number of iterations
-                int   max_grid_size = 0;     //!< 0: leave it up to the enactor
-                int   num_gpus      = 1;     //!< for multi-gpu enactor to use
-                int   src_node      = -1;    //!< source node to start
-
-                // determine source vertex to start sssp
-                switch (pr_config.src_mode) {
-                case randomize: {
-                    src_node = graphio::RandomNode(csr_graph.nodes);
-                    break;
-                }
-                case largest_degree: {
-                    int max_node = 0;
-                    src_node = csr_graph.GetNodeWithHighestDegree(max_node);
-                    break;
-                }
-                case manually: {
-                    src_node = pr_config.src_node;
-                    break;
-                }
-                default: {
-                    src_node = -1;
-                    break;
-                }
-                }
-                delta    = pr_config.delta;
-                error    = pr_config.error;
-                max_iter = pr_config.max_iter;
-
-                run_page_rank<int, float, int>(
-                    ggraph_out,
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+
+                // pagerank configurations
+                float delta         = 0.85f;  // default delta value
+                float error         = 0.01f;  // error threshold
+                int   max_iter      = 20;     // maximum number of iterations
+                int   max_grid_size = 0;      // 0: leave it up to the enactor
+                int   num_gpus      = 1;      // for multi-gpu enactor to use
+
+                delta    = config.delta;
+                error    = config.error;
+                max_iter = config.max_iter;
+
+                run_pagerank<int, float, int>(
+                    graph_o,
                     (int*)node_ids,
-                    (float*)page_rank,
+                    (float*)pagerank,
                     csr_graph,
-                    src_node,
                     delta,
                     error,
                     max_iter,
@@ -215,37 +163,75 @@ void dispatch_page_rank(
 }
 
 /**
- * @brief run_page_rank entry
+ * @brief gunrock_pagerank entry
  *
- * @param[out] ggraph_out output of pr problem
+ * @param[out] graph_o    output of pr problem
  * @param[out] node_ids   output of pr problem
  * @param[out] page_rank  output of pr problem
- * @param[in]  ggraph_in  input graph need to process on
- * @param[in]  pr_config  gunrock primitive specific configurations
- * @param[in]  data_type  gunrock datatype struct
+ * @param[in]  graph_i    input graph need to process on
+ * @param[in]  config     gunrock primitive specific configurations
+ * @param[in]  data_t     gunrock data_t struct
  */
-void gunrock_pr_func(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *page_rank,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   pr_config,
-    const GunrockDataType data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = pr_config.device;
+void gunrock_pagerank(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *pagerank,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_pagerank(
+        graph_o, node_ids, pagerank, graph_i, config, data_t, *context);
+}
 
-    // luanch dispatch function
-    dispatch_page_rank(
-        ggraph_out,
-        node_ids,
-        page_rank,
-        ggraph_in,
-        pr_config,
-        data_type,
-        *context);
+/**
+ * @brief Simple interface that takes CSR arrays as input
+ * @param[out] node_ids    Return node IDs written by the PageRank run
+ * @param[out] pagerank    Return PageRank scores per node
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ */
+void pagerank(
+    int*                node_ids,
+    float*              pagerank,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;            // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;    // integer
+    data_t.SIZET_TYPE = SIZET_INT;    // integer
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float ranks
+
+    struct GRSetup config;     // primitive-specific configures
+    config.device    =     0;  // setting device to run
+    config.delta     = 0.85f;  // default delta value
+    config.error     = 0.01f;  // default error threshold
+    config.max_iter  =    20;  // maximum number of iterations
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, data_t);
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
+
+    printf("------------------- completed -------------------\n");
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/pr/pr_enactor.cuh b/gunrock/app/pr/pr_enactor.cuh
index 40b0d4ec7..fafe7ff15 100644
--- a/gunrock/app/pr/pr_enactor.cuh
+++ b/gunrock/app/pr/pr_enactor.cuh
@@ -6,11 +6,11 @@
 // ---------------------------------------------------------------------------
 
 /**
- * @file
- * pr_enactor.cuh
- *
- * @brief PR Problem Enactor
- */
+* @file
+* pr_enactor.cuh
+*
+* @brief PR Problem Enactor
+*/
 
 #pragma once
 
@@ -32,358 +32,666 @@ namespace gunrock {
 namespace app {
 namespace pr {
 
-    template <typename Problem, bool _INSTRUMENT, bool _DEBUG, bool _SIZE_CHECK> class Enactor;
+template <typename Problem, bool _INSTRUMENT, bool _DEBUG, bool _SIZE_CHECK> class Enactor;
 
-    template <typename DataSlice>
-    __global__ void Print_Const (
-        const DataSlice* const data_slice)
+template <typename DataSlice>
+__global__ void Print_Const (
+    const DataSlice* const data_slice)
+{
+    printf("delta = %f, threshold = %f, src_node = %d\n",
+            data_slice->delta, data_slice->threshold, data_slice->src_node);
+}
+
+template <
+    typename VertexId,
+    typename SizeT>
+__global__ void Mark_Queue_R0D (
+    const SizeT           num_elements,
+    const VertexId* const keys_in,
+    const SizeT*    const degrees,
+          SizeT*          marker)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while ( x < num_elements)
     {
-        printf("delta = %f, threshold = %f, src_node = %d\n",
-                data_slice->delta, data_slice->threshold, data_slice->src_node);
+        VertexId key = keys_in[x];
+        //if (degrees[key] == 0) printf("d[%d @ %d]==0 \t", key, x);
+        marker[x] = degrees[key]==0? 1 :0;
+        x += STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT>
-    __global__ void Mark_Queue_R0D (
-        const SizeT           num_elements,
-        const VertexId* const keys_in,
-        const SizeT*    const degrees,
-              SizeT*          marker)
-    {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
+template <
+    typename VertexId,
+    typename SizeT>
+__global__ void Make_Queue_R0D (
+    const SizeT           num_elements,
+    const VertexId* const keys_in,
+    const SizeT*    const marker,
+          VertexId*       keys_out)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
 
-        while ( x < num_elements)
+    while (x < num_elements)
+    {
+        SizeT Mx = marker[x];
+        if ((x!=0 && marker[x-1]!=Mx)
+           ||(x==0 && Mx==1))
         {
-            VertexId key = keys_in[x];
-            //if (degrees[key] == 0) printf("d[%d @ %d]==0 \t", key, x);
-            marker[x] = degrees[key]==0? 1 :0;
-            x += STRIDE;
+            keys_out[Mx-1] = keys_in[x];
         }
+        x += STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT>
-    __global__ void Make_Queue_R0D (
-        const SizeT           num_elements,
-        const VertexId* const keys_in,
-        const SizeT*    const marker,
-              VertexId*       keys_out)
+template <
+    typename VertexId,
+    typename SizeT,
+    typename Value,
+    int      NUM_VERTEX_ASSOCIATES,
+    int      NUM_VALUE__ASSOCIATES>
+__global__ void Expand_Incoming_R0D (
+    const SizeT           num_elements,
+    const VertexId* const keys_in,
+          SizeT*          degrees)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
+    while (x < num_elements)
     {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
-
-        while (x < num_elements)
-        {
-            SizeT Mx = marker[x];
-            if ((x!=0 && marker[x-1]!=Mx)
-               ||(x==0 && Mx==1))
-            {
-                keys_out[Mx-1] = keys_in[x];
-            }
-            x += STRIDE;
-        }
+        VertexId key = keys_in[x];
+        degrees[key] = 0;
+        x += STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT,
-        typename Value,
-        int      NUM_VERTEX_ASSOCIATES,
-        int      NUM_VALUE__ASSOCIATES>
-    __global__ void Expand_Incoming_R0D (
-        const SizeT           num_elements,
-        const VertexId* const keys_in,
-              SizeT*          degrees)
+template <
+    typename VertexId,
+    typename SizeT>
+__global__ void Clear_Zero_R0D (
+    const SizeT        num_elements,
+    const SizeT* const degrees_curr,
+          SizeT*       degrees_next)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
+    while (x < num_elements)
     {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
-        while (x < num_elements)
-        {
-            VertexId key = keys_in[x];
-            degrees[key] = 0;
-            x += STRIDE;
-        }
+        if (degrees_curr[x] == 0)
+            degrees_next[x] = -1;
+        x += STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT>
-    __global__ void Clear_Zero_R0D (
-        const SizeT        num_elements,
-        const SizeT* const degrees_curr,
-              SizeT*       degrees_next)
+template <
+    typename VertexId,
+    typename SizeT,
+    typename Value,
+    int      NUM_VERTEX_ASSOCIATES,
+    int      NUM_VALUE__ASSOCIATES>
+__global__ void Expand_Incoming_PR (
+    const SizeT           num_elements,
+    const VertexId* const keys_in,
+    const size_t          array_size,
+          char*           array)
+{
+    extern __shared__ char s_array[];
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    size_t offset = 0;
+    offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES;
+    Value** s_value__associate_in  = (Value**)&(s_array[offset]);
+    offset += sizeof(Value*   ) * NUM_VALUE__ASSOCIATES;
+    offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES;
+    Value** s_value__associate_org = (Value**)&(s_array[offset]);
+    SizeT x = threadIdx.x;
+    while (x < array_size)
     {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        VertexId x = blockIdx.x * blockDim.x + threadIdx.x;
-        while (x < num_elements)
-        {
-            if (degrees_curr[x] == 0)
-                degrees_next[x] = -1;
-            x += STRIDE;
-        }
+        s_array[x] = array[x];
+        x += blockDim.x;
     }
+    __syncthreads();
 
-    template <
-        typename VertexId,
-        typename SizeT,
-        typename Value,
-        int      NUM_VERTEX_ASSOCIATES,
-        int      NUM_VALUE__ASSOCIATES>
-    __global__ void Expand_Incoming_PR (
-        const SizeT           num_elements,
-        const VertexId* const keys_in,
-        const size_t          array_size,
-              char*           array)
+    x = blockIdx.x * blockDim.x + threadIdx.x;
+    while (x < num_elements)
     {
-        extern __shared__ char s_array[];
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        size_t offset = 0;
-        offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES;
-        Value** s_value__associate_in  = (Value**)&(s_array[offset]);
-        offset += sizeof(Value*   ) * NUM_VALUE__ASSOCIATES;
-        offset += sizeof(VertexId*) * NUM_VERTEX_ASSOCIATES;
-        Value** s_value__associate_org = (Value**)&(s_array[offset]);
-        SizeT x = threadIdx.x;
-        while (x < array_size)
-        {
-            s_array[x] = array[x];
-            x += blockDim.x;
-        }
-        __syncthreads();
-
-        x = blockIdx.x * blockDim.x + threadIdx.x;
-        while (x < num_elements)
-        {
-            VertexId key = keys_in[x];
-            Value old_value=atomicAdd(s_value__associate_org[0] + key, s_value__associate_in[0][x]);
-            if (TO_TRACK)
-            if (to_track(key)) printf("rank[%d] = %f + %f \n", key, old_value, s_value__associate_in[0][x]);
-            x+=STRIDE;
-        }
+        VertexId key = keys_in[x];
+        Value old_value=atomicAdd(s_value__associate_org[0] + key, s_value__associate_in[0][x]);
+        if (TO_TRACK)
+        if (to_track(key)) printf("rank[%d] = %f + %f \n", key, old_value, s_value__associate_in[0][x]);
+        x+=STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT>
-    __global__ void Assign_Marker_PR(
-        const SizeT     num_elements,
-        const int       num_gpus,
-        const SizeT*    markers,
-        const int*      partition_table,
-              SizeT**   key_markers)
+template <
+    typename VertexId,
+    typename SizeT>
+__global__ void Assign_Marker_PR(
+    const SizeT     num_elements,
+    const int       num_gpus,
+    const SizeT*    markers,
+    const int*      partition_table,
+          SizeT**   key_markers)
+{
+    extern __shared__ SizeT* s_marker[];
+    int   gpu = 0;
+    SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    if (threadIdx.x < num_gpus)
+        s_marker[threadIdx.x] = key_markers[threadIdx.x];
+    __syncthreads();
+
+    while (x < num_elements)
     {
-        extern __shared__ SizeT* s_marker[];
-        int   gpu = 0;
-        SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        if (threadIdx.x < num_gpus)
-            s_marker[threadIdx.x] = key_markers[threadIdx.x];
-        __syncthreads();
-
-        while (x < num_elements)
+        //gpu = num_gpus;
+        gpu = partition_table[x];
+        if (markers[x] != 1 && gpu != 0)
         {
-            //gpu = num_gpus;
-            gpu = partition_table[x];
-            if (markers[x] != 1 && gpu != 0)
-            {
-                gpu = num_gpus;
-            } 
-            for (int i=0; i<num_gpus; i++)
-                s_marker[i][x] = (i==gpu)?1:0;
-            x+=STRIDE;
-        }
+            gpu = num_gpus;
+        } 
+        for (int i=0; i<num_gpus; i++)
+            s_marker[i][x] = (i==gpu)?1:0;
+        x+=STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT>
-    __global__ void Assign_Keys_PR (
-        const SizeT          num_elements,
-        const int            num_gpus,
-        const int*           partition_table,
-        const SizeT*         markers,
-              SizeT**        keys_markers,
-              VertexId**     keys_outs)
-    {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
+template <
+    typename VertexId,
+    typename SizeT>
+__global__ void Assign_Keys_PR (
+    const SizeT          num_elements,
+    const int            num_gpus,
+    const int*           partition_table,
+    const SizeT*         markers,
+          SizeT**        keys_markers,
+          VertexId**     keys_outs)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
 
-        while (x < num_elements)
+    while (x < num_elements)
+    {
+        int gpu = partition_table[x];
+        if (markers[x] == 1 || gpu == 0)
         {
-            int gpu = partition_table[x];
-            if (markers[x] == 1 || gpu == 0)
-            {
-                //if (gpu > 0)
-                //{
-                    SizeT pos = keys_markers[gpu][x]-1;
-                    //printf("keys_outs[%d][%d] <- %d \t", gpu, pos, x);
-                    keys_outs[gpu][pos] = x;
-                //}
-            }
-            x+=STRIDE;
+            //if (gpu > 0)
+            //{
+                SizeT pos = keys_markers[gpu][x]-1;
+                //printf("keys_outs[%d][%d] <- %d \t", gpu, pos, x);
+                keys_outs[gpu][pos] = x;
+            //}
         }
+        x+=STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT,
-        typename Value>
-    __global__ void Assign_Values_PR (
-        const SizeT           num_elements,
-        const VertexId* const keys_out,
-        const Value*    const rank_next,
-              Value*          rank_out)
-    {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
+template <
+    typename VertexId,
+    typename SizeT,
+    typename Value>
+__global__ void Assign_Values_PR (
+    const SizeT           num_elements,
+    const VertexId* const keys_out,
+    const Value*    const rank_next,
+          Value*          rank_out)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
 
-        while (x < num_elements)
-        {
-            VertexId key = keys_out[x];
-            rank_out[x] = rank_next[key];
-            x+=STRIDE;
-        }
+    while (x < num_elements)
+    {
+        VertexId key = keys_out[x];
+        rank_out[x] = rank_next[key];
+        x+=STRIDE;
     }
+}
 
-    template <
-        typename VertexId,
-        typename SizeT,
-        typename Value>
-    __global__ void Expand_Incoming_Final (
-        const SizeT num_elements,
-        const VertexId* const keys_in,
-        const Value*    const ranks_in,
-              Value*          ranks_out)
+template <
+    typename VertexId,
+    typename SizeT,
+    typename Value>
+__global__ void Expand_Incoming_Final (
+    const SizeT num_elements,
+    const VertexId* const keys_in,
+    const Value*    const ranks_in,
+          Value*          ranks_out)
+{
+    const SizeT STRIDE = gridDim.x * blockDim.x;
+    SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
+    while (x < num_elements)
     {
-        const SizeT STRIDE = gridDim.x * blockDim.x;
-        SizeT x = blockIdx.x * blockDim.x + threadIdx.x;
-        while (x < num_elements)
-        {
-            VertexId key = keys_in[x];
-            ranks_out[key] = ranks_in[x];
-            x+=STRIDE;
-        }
+        VertexId key = keys_in[x];
+        ranks_out[key] = ranks_in[x];
+        x+=STRIDE;
     }
+}
 
 template <
-    typename AdvanceKernelPolicy,
-    typename FilterKernelPolicy,
-    typename Enactor>
+typename AdvanceKernelPolicy,
+typename FilterKernelPolicy,
+typename Enactor>
 struct R0DIteration : public IterationBase <
-    AdvanceKernelPolicy, FilterKernelPolicy, Enactor,
-    false, //HAS_SUBQ
-    true,  //HAS_FULLQ
-    false, //BACKWARD
-    true,  //FORWARD
-    false> //UPDATE_PREDECESSORS
+AdvanceKernelPolicy, FilterKernelPolicy, Enactor,
+false, //HAS_SUBQ
+true,  //HAS_FULLQ
+false, //BACKWARD
+true,  //FORWARD
+false> //UPDATE_PREDECESSORS
 {
 public:
-    typedef typename Enactor::SizeT      SizeT     ;    
-    typedef typename Enactor::Value      Value     ;    
-    typedef typename Enactor::VertexId   VertexId  ;
-    typedef typename Enactor::Problem    Problem   ;
-    typedef typename Problem::DataSlice  DataSlice ;
-    typedef GraphSlice<SizeT, VertexId, Value> GraphSlice;
-    typedef RemoveZeroDegreeNodeFunctor<
-            VertexId,
-            SizeT,
-            Value,
-            Problem> RemoveZeroFunctor;
-
-    static void FullQueue_Gather(
-        int                            thread_num,
-        int                            peer_,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        DataSlice                     *data_slice,
-        DataSlice                     *d_data_slice,
-        GraphSlice                    *graph_slice,
-        util::CtaWorkProgressLifetime *work_progress,
-        ContextPtr                     context,
-        cudaStream_t                   stream)
+typedef typename Enactor::SizeT      SizeT     ;    
+typedef typename Enactor::Value      Value     ;    
+typedef typename Enactor::VertexId   VertexId  ;
+typedef typename Enactor::Problem    Problem   ;
+typedef typename Problem::DataSlice  DataSlice ;
+typedef GraphSlice<SizeT, VertexId, Value> GraphSlice;
+typedef RemoveZeroDegreeNodeFunctor<
+        VertexId,
+        SizeT,
+        Value,
+        Problem> RemoveZeroFunctor;
+
+static void FullQueue_Gather(
+    int                            thread_num,
+    int                            peer_,
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue,
+    util::Array1D<SizeT, SizeT>   *scanned_edges,
+    FrontierAttribute<SizeT>      *frontier_attribute,
+    EnactorStats                  *enactor_stats,
+    DataSlice                     *data_slice,
+    DataSlice                     *d_data_slice,
+    GraphSlice                    *graph_slice,
+    util::CtaWorkProgressLifetime *work_progress,
+    ContextPtr                     context,
+    cudaStream_t                   stream)
+{
+    if (enactor_stats->iteration == 0)
     {
-        if (enactor_stats->iteration == 0)
-        {
-            frontier_attribute->queue_reset  = true;
-            frontier_attribute->selector     = 0;
-            frontier_attribute->queue_index  = 0;
-            frontier_attribute->queue_length = data_slice->num_gpus>1 ? data_slice->local_nodes : graph_slice->nodes;
-        }
+        frontier_attribute->queue_reset  = true;
+        frontier_attribute->selector     = 0;
+        frontier_attribute->queue_index  = 0;
+        frontier_attribute->queue_length = data_slice->num_gpus>1 ? data_slice->local_nodes : graph_slice->nodes;
     }
+}
+
+static void FullQueue_Core(
+    int                            thread_num,
+    int                            peer_,
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue,
+    util::Array1D<SizeT, SizeT>   *scanned_edges,
+    FrontierAttribute<SizeT>      *frontier_attribute,
+    EnactorStats                  *enactor_stats,
+    DataSlice                     *data_slice,
+    DataSlice                     *d_data_slice,
+    GraphSlice                    *graph_slice,
+    util::CtaWorkProgressLifetime *work_progress,
+    ContextPtr                     context,
+    cudaStream_t                   stream)
+{
+    //Print_Const<DataSlice><<<1,1,0,stream>>>(d_data_slice);
+    SizeT num_valid_node = frontier_attribute->queue_length; 
+
+    //util::DisplayDeviceResults(problem->graph_slices[0]->frontier_queues.d_keys[selector],
+    //    num_elements);
+    //util::cpu_mt::PrintGPUArray<SizeT, VertexId>("keys0", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
+    //util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees0", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
+
+    //bool over_sized = false;
+    //if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, SizeT>(
+    //    "scanned_edges", frontier_attribute->queue_length, scanned_edges, over_sized, thread_num, enactor_stats->iteration, peer_)) return;
+    //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
+    frontier_attribute->queue_reset = true;
+    gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, RemoveZeroFunctor>(
+        enactor_stats[0],
+        frontier_attribute[0],
+        d_data_slice,
+        (VertexId*)NULL,
+        (bool*    )NULL,
+        (bool*    )NULL,
+        scanned_edges->GetPointer(util::DEVICE),
+        frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE),// d_in_queue
+        frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue
+        (VertexId*)NULL,
+        (VertexId*)NULL,
+        graph_slice->row_offsets   .GetPointer(util::DEVICE),
+        graph_slice->column_indices.GetPointer(util::DEVICE),
+        (SizeT*   )NULL,
+        (VertexId*)NULL,
+        graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector],   // max_in_queue
+        graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1], // max_out_queue
+        work_progress[0],
+        context[0],
+        stream,
+        gunrock::oprtr::advance::V2V,
+        false,
+        false);
+
+    //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
+    //      "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break; 
+    enactor_stats      -> Accumulate(
+        work_progress  -> GetQueueLengthPointer<unsigned int,SizeT>(frontier_attribute->queue_index+1), stream);
+
+    gunrock::oprtr::filter::Kernel<FilterKernelPolicy, Problem, RemoveZeroFunctor>
+        <<<enactor_stats->filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>(
+        enactor_stats->iteration,
+        frontier_attribute->queue_reset,
+        frontier_attribute->queue_index,
+        frontier_attribute->queue_length,
+        frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE),      // d_in_queue
+        NULL,
+        frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),    // d_out_queue
+        d_data_slice,
+        NULL,
+        work_progress[0],
+        frontier_queue->keys[frontier_attribute->selector  ].GetSize(),           // max_in_queue
+        frontier_queue->keys[frontier_attribute->selector^1].GetSize(),         // max_out_queue
+        enactor_stats->filter_kernel_stats);
+
+    //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
+    //      "filter::Kernel RemoveZeroFunctor failed", __FILE__, __LINE__)))
+    //    break;
+
+    Clear_Zero_R0D <SizeT, VertexId>
+        <<<128, 128, 0, stream>>> (
+        graph_slice->nodes,
+        data_slice -> degrees.GetPointer(util::DEVICE),
+        data_slice -> degrees_pong.GetPointer(util::DEVICE));
+
+    util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>(
+        data_slice->degrees.GetPointer(util::DEVICE),
+        data_slice->degrees_pong.GetPointer(util::DEVICE), graph_slice->nodes);
+
+    //util::DisplayDeviceResults(problem->data_slices[0]->d_degrees,
+    //        graph_slice->nodes);
+
+    frontier_attribute->queue_index++;
+    frontier_attribute->selector^=1;
+    if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
+    if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return;
+    //enactor_stats->total_queued[0] += frontier_attribute->queue_length;
+    //util::cpu_mt::PrintGPUArray<SizeT, VertexId>("keys1", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
+    //util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees1", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
+
+    if (num_valid_node == frontier_attribute->queue_length || num_valid_node==0) data_slice->to_continue = false;
+    else data_slice->to_continue = true;
+}
+
+static cudaError_t Compute_OutputLength(
+    FrontierAttribute<SizeT> *frontier_attribute,
+    SizeT       *d_offsets,
+    VertexId    *d_indices,
+    VertexId    *d_in_key_queue,
+    util::Array1D<SizeT, SizeT>       *partitioned_scanned_edges,
+    SizeT        max_in,
+    SizeT        max_out,
+    CudaContext                    &context,
+    cudaStream_t                   stream,
+    gunrock::oprtr::advance::TYPE  ADVANCE_TYPE,
+    bool                           express = false)
+{
+    cudaError_t retval = cudaSuccess;
+    bool over_sized = false;
+    if (retval = Check_Size<Enactor::SIZE_CHECK, SizeT, SizeT> (
+        "scanned_edges", frontier_attribute->queue_length, partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval;
+    retval = gunrock::oprtr::advance::ComputeOutputLength
+        <AdvanceKernelPolicy, Problem, RemoveZeroFunctor>(
+        frontier_attribute,
+        d_offsets,
+        d_indices,
+        d_in_key_queue,
+        partitioned_scanned_edges->GetPointer(util::DEVICE),
+        max_in,
+        max_out,
+        context,
+        stream,
+        ADVANCE_TYPE,
+        express);
+    return retval;
+}    
+
+template <int NUM_VERTEX_ASSOCIATES, int NUM_VALUE__ASSOCIATES>
+static void Expand_Incoming(
+          int             grid_size,
+          int             block_size,
+          size_t          shared_size,
+          cudaStream_t    stream,
+          SizeT           &num_elements,
+    const VertexId* const keys_in,
+    util::Array1D<SizeT, VertexId>*       keys_out,
+    const size_t          array_size,
+          char*           array,
+          DataSlice*      data_slice)
+{
+    Expand_Incoming_R0D
+        <VertexId, SizeT, Value, NUM_VERTEX_ASSOCIATES, NUM_VALUE__ASSOCIATES>
+        <<<grid_size, block_size, shared_size, stream>>> (
+        num_elements,
+        keys_in,
+        data_slice->degrees.GetPointer(util::DEVICE));
+    num_elements = 0; 
+}
+
+static bool Stop_Condition(
+    EnactorStats   *enactor_stats,
+    FrontierAttribute<SizeT> *frontier_attribute,
+    util::Array1D<SizeT, DataSlice> *data_slice,
+    int num_gpus)
+{    
+    //printf("CC Stop checked\n");fflush(stdout);
+    for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++)
+    if (enactor_stats[gpu].retval != cudaSuccess)
+    {    
+        printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout);
+        return true;
+    }    
+
+    /*for (int gpu = 0; gpu< num_gpus*num_gpus; gpu++)
+    if (enactor_stats[gpu].iteration == 0)
+    {
+        printf("enactor_stats[%d].iteration ==0\n", gpu);fflush(stdout);
+        return false;
+    }*/
+
+    for (int gpu=0; gpu<num_gpus; gpu++)
+        if (data_slice[gpu]->to_continue && frontier_attribute[gpu*num_gpus].queue_length !=0)
+    {    
+        //printf("data_slice[%d]->to_continue, frontier_attribute[%d].queue_length = %d\n", gpu, gpu*num_gpus, frontier_attribute[gpu*num_gpus].queue_length);fflush(stdout);
+        return false;
+    }    
  
-    static void FullQueue_Core(
-        int                            thread_num,
-        int                            peer_,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        DataSlice                     *data_slice,
-        DataSlice                     *d_data_slice,
-        GraphSlice                    *graph_slice,
-        util::CtaWorkProgressLifetime *work_progress,
-        ContextPtr                     context,
-        cudaStream_t                   stream)
+    for (int gpu=0; gpu<num_gpus; gpu++)
+    for (int peer=1; peer<num_gpus; peer++)
+    for (int i=0; i<2; i++) 
+    if (data_slice[gpu]->in_length[i][peer]!=0)
+    {    
+        //printf("data_slice[%d]->in_length[%d][%d] = %d\n", gpu, i, peer, data_slice[gpu]->in_length[i][peer]);fflush(stdout);
+        return false;
+    }    
+
+    for (int gpu=0; gpu<num_gpus; gpu++)
+    for (int peer=1; peer<num_gpus; peer++)
+    if (data_slice[gpu]->out_length[peer]!=0) 
+    {    
+        //printf("data_slice[%d]->out_length[%d] = %d\n", gpu, peer, data_slice[gpu]->out_length[peer]); fflush(stdout);
+        return false;
+    }    
+    //printf("CC to stop\n");fflush(stdout);
+    return true;
+}    
+
+template <
+    int NUM_VERTEX_ASSOCIATES,
+    int NUM_VALUE__ASSOCIATES>
+static void Make_Output(
+    int                            thread_num,
+    SizeT                          num_elements,
+    int                            num_gpus,
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue,
+    util::Array1D<SizeT, SizeT>   *scanned_edges,
+    FrontierAttribute<SizeT>      *frontier_attribute,
+    EnactorStats                  *enactor_stats,
+    util::Array1D<SizeT, DataSlice>
+                                  *data_slice,
+    GraphSlice                    *graph_slice,
+    util::CtaWorkProgressLifetime *work_progress,
+    ContextPtr                     context,
+    cudaStream_t                   stream)
+{
+    if (num_elements == 0)
     {
-        //Print_Const<DataSlice><<<1,1,0,stream>>>(d_data_slice);
-        SizeT num_valid_node = frontier_attribute->queue_length; 
-
-        //util::DisplayDeviceResults(problem->graph_slices[0]->frontier_queues.d_keys[selector],
-        //    num_elements);
-        //util::cpu_mt::PrintGPUArray<SizeT, VertexId>("keys0", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
-        //util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees0", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
-
-        //bool over_sized = false;
-        //if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, SizeT>(
-        //    "scanned_edges", frontier_attribute->queue_length, scanned_edges, over_sized, thread_num, enactor_stats->iteration, peer_)) return;
-        //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
-        frontier_attribute->queue_reset = true;
-        gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, RemoveZeroFunctor>(
-            enactor_stats[0],
-            frontier_attribute[0],
-            d_data_slice,
-            (VertexId*)NULL,
-            (bool*    )NULL,
-            (bool*    )NULL,
-            scanned_edges->GetPointer(util::DEVICE),
-            frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE),// d_in_queue
-            frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue
-            (VertexId*)NULL,
-            (VertexId*)NULL,
-            graph_slice->row_offsets   .GetPointer(util::DEVICE),
-            graph_slice->column_indices.GetPointer(util::DEVICE),
-            (SizeT*   )NULL,
-            (VertexId*)NULL,
-            graph_slice->nodes, //graph_slice->frontier_elements[frontier_attribute.selector],   // max_in_queue
-            graph_slice->edges, //graph_slice->frontier_elements[frontier_attribute.selector^1], // max_out_queue
-            work_progress[0],
-            context[0],
-            stream,
-            gunrock::oprtr::advance::V2V,
-            false,
-            false);
+        for (int peer_ =0; peer_<num_gpus; peer_++)
+            data_slice[0]->out_length[peer_] = 0;
+        return;
+    }
+
+    int block_size = 256;                              // threads per block for the kernels launched below
+    int grid_size  = num_elements / block_size;
+    int peer_      = 0;
+    if ((num_elements % block_size)!=0) grid_size ++;  // round up so a partial last block is still launched
+    if (grid_size > 512) grid_size = 512;              // cap the launch at 512 blocks
+    // Step 1: Mark_Queue_R0D (defined elsewhere) receives the frontier keys, degrees and markers.
+    //util::MemsetKernel<<<128, 128, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), 0, num_elements); 
+    Mark_Queue_R0D <VertexId, SizeT>
+        <<<grid_size, block_size, 0, stream>>> (
+        num_elements,
+        frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
+        data_slice[0] -> degrees.GetPointer(util::DEVICE),
+        data_slice[0] -> markers.GetPointer(util::DEVICE));
+    //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), num_elements, thread_num, enactor_stats->iteration, -1, stream);
+    // Step 2: inclusive plus-scan over markers, in place (input and output are the same buffer).
+    Scan<mgpu::MgpuScanTypeInc>(
+        (int*)data_slice[0] -> markers.GetPointer(util::DEVICE), // NOTE(review): scanned as int, copied back as SizeT below - assumes equal size; confirm
+        num_elements,
+        (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
+        (int*)data_slice[0] -> markers.GetPointer(util::DEVICE),
+        context[0]);
+    // Step 3: Make_Queue_R0D gets the keys, the scanned markers and keys_out[1]; presumably it compacts into keys_out[1] - confirm.
+    Make_Queue_R0D <VertexId, SizeT>
+        <<<grid_size, block_size, 0, stream>>> (
+        num_elements,
+        frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
+        data_slice[0]->markers.GetPointer(util::DEVICE),
+        data_slice[0]->keys_out[1].GetPointer(util::DEVICE));
+    // Only when SIZE_CHECK is off: MemsetCopyVectorKernel over frontier_queues[0] keys and the current frontier keys.
+    if (!Enactor::SIZE_CHECK)
+        util::MemsetCopyVectorKernel <<<grid_size, block_size, 0, stream>>>(
+            data_slice[0]->frontier_queues[0].keys[frontier_attribute->selector].GetPointer(util::DEVICE),
+            frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
+            num_elements);
+    // Fetch markers[num_elements-1] (the inclusive-scan total) into out_length[1]; the host may read it only after the sync below.
+    cudaMemcpyAsync(&data_slice[0]->out_length[1], data_slice[0]->markers.GetPointer(util::DEVICE) + num_elements -1, sizeof(SizeT), cudaMemcpyDeviceToHost, stream);
+    //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout);
+    if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return;
+    //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout);
+    for (peer_ = 2; peer_ < num_gpus; peer_++)                           // every remaining peer gets the same length as peer 1
+        data_slice[0]->out_length[peer_] = data_slice[0]->out_length[1];
+    data_slice[0]->out_length[0] = frontier_attribute->queue_length;     // slot 0 takes the current frontier length
+}
+
+/*static void Check_Queue_Size(
+    int                            thread_num,
+    int                            peer_,
+    SizeT                          request_length,
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue,
+    //util::Array1D<SizeT, SizeT>   *scanned_edges,
+    FrontierAttribute<SizeT>      *frontier_attribute,
+    EnactorStats                  *enactor_stats,
+    //DataSlice                     *data_slice,
+    //DataSlice                     *d_data_slice,
+    GraphSlice                    *graph_slice
+    //util::CtaWorkProgressLifetime *work_progress,
+    //ContextPtr                     context,
+    //cudaStream_t                   stream
+    )    
+{    
+    bool over_sized = false;
+    int  selector   = frontier_attribute->selector;
+    int  iteration  = enactor_stats -> iteration;
+
+    if (Enactor::DEBUG)
+        printf("%d\t %d\t %d\t queue_length = %d, output_length = %d\n",
+            thread_num, iteration, peer_,
+            frontier_queue->keys[selector^1].GetSize(),
+            request_length);fflush(stdout);
+
+    if (enactor_stats->retval = 
+        Check_Size<true, SizeT, VertexId > ("queue3", request_length, &frontier_queue->keys  [selector^1], over_sized, thread_num, iteration, peer_, false)) return; 
+    if (enactor_stats->retval = 
+        Check_Size<true, SizeT, VertexId > ("queue3", graph_slice->nodes+2, &frontier_queue->keys  [selector  ], over_sized, thread_num, iteration, peer_, true )) return; 
+    if (Problem::USE_DOUBLE_BUFFER)
+    {    
+        if (enactor_stats->retval = 
+            Check_Size<true, SizeT, Value> ("queue3", request_length, &frontier_queue->values[selector^1], over_sized, thread_num, iteration, peer_, false)) return; 
+        if (enactor_stats->retval = 
+            Check_Size<true, SizeT, Value> ("queue3", graph_slice->nodes+2, &frontier_queue->values[selector  ], over_sized, thread_num, iteration, peer_, true )) return; 
+    }    
+} */   
 
-        //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
-        //      "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break; 
-        enactor_stats      -> Accumulate(
-            work_progress  -> GetQueueLengthPointer<unsigned int,SizeT>(frontier_attribute->queue_index+1), stream);
+}; // end R0DIteration
 
-        gunrock::oprtr::filter::Kernel<FilterKernelPolicy, Problem, RemoveZeroFunctor>
-            <<<enactor_stats->filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>(
+template <                      // PR iteration callbacks (FullQueue_Core, Compute_OutputLength, Expand_Incoming, Stop_Condition, Make_Output) on top of IterationBase
+typename AdvanceKernelPolicy,   // policy type forwarded to gunrock::oprtr::advance::LaunchKernel / ComputeOutputLength
+typename FilterKernelPolicy,    // policy type forwarded to gunrock::oprtr::filter::Kernel (also supplies THREADS for the launch)
+typename Enactor>               // supplies SizeT / Value / VertexId / Problem plus the SIZE_CHECK, DEBUG and INSTRUMENT flags
+struct PRIteration : public IterationBase <
+AdvanceKernelPolicy, FilterKernelPolicy, Enactor,
+false, //HAS_SUBQ
+true,  //HAS_FULLQ
+false, //BACKWARD
+true,  //FORWARD
+false> //UPDATE_PREDECESSORS
+{
+public:
+typedef typename Enactor::SizeT      SizeT     ;    
+typedef typename Enactor::Value      Value     ;    
+typedef typename Enactor::VertexId   VertexId  ;
+typedef typename Enactor::Problem    Problem   ;
+typedef typename Problem::DataSlice  DataSlice ;
+typedef GraphSlice     <SizeT, VertexId, Value> GraphSlice;
+typedef PRFunctor      <VertexId, SizeT, Value, Problem> PrFunctor;             // functor given to the filter kernel and the advance launch in FullQueue_Core
+typedef PRMarkerFunctor<VertexId, SizeT, Value, Problem> PrMarkerFunctor;       // not referenced in the visible part of this struct
+
+static void FullQueue_Core(                        // one PR step: filter pass (skipped on iteration 0), then an advance pass
+    int                            thread_num,     // only referenced by commented-out debug code in the visible body
+    int                            peer_,          // only referenced by commented-out debug code in the visible body
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue, // keys[selector] is the input queue of both the filter and the advance
+    util::Array1D<SizeT, SizeT>   *scanned_edges,  // device pointer handed to the advance launch
+    FrontierAttribute<SizeT>      *frontier_attribute, // queue_length / queue_index / queue_reset are updated here
+    EnactorStats                  *enactor_stats,  // retval receives any error; total_queued[0] is accumulated
+    DataSlice                     *data_slice,     // dereferenced on the host (edge_map_queue_len, PR_queue_length, rank buffers)
+    DataSlice                     *d_data_slice,   // passed through to the filter and advance kernels
+    GraphSlice                    *graph_slice,    // provides nodes, edges, row_offsets and column_indices
+    util::CtaWorkProgressLifetime *work_progress,  // source of the queue lengths read back via GetQueueLength
+    ContextPtr                     context,
+    cudaStream_t                   stream)         // all kernels, copies and synchronizations below use this stream
+{
+    //Print_Const<DataSlice><<<1,1,0,stream>>>(d_data_slice);
+    //for (int i=0; i<3; i++)
+    //{
+    //if (enactor_stats -> iteration != 0 || i!=0)
+    if (enactor_stats -> iteration != 0)           // the filter pass is skipped on the very first iteration
+    {
+        frontier_attribute->queue_length = data_slice -> edge_map_queue_len;   // filter input length comes from edge_map_queue_len
+        enactor_stats->total_queued[0] += frontier_attribute->queue_length;
+        // Filter launch: reads keys[selector]; its output-queue argument is NULL.
+        //printf("Filter start.\n");fflush(stdout); 
+         // filter kernel
+        gunrock::oprtr::filter::Kernel<FilterKernelPolicy, Problem, PrFunctor>
+        <<<enactor_stats->filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>(
             enactor_stats->iteration,
             frontier_attribute->queue_reset,
             frontier_attribute->queue_index,
             frontier_attribute->queue_length,
             frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE),      // d_in_queue
             NULL,
-            frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),    // d_out_queue
+            NULL,//frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue
             d_data_slice,
             NULL,
             work_progress[0],
@@ -391,54 +699,133 @@ public:
             frontier_queue->keys[frontier_attribute->selector^1].GetSize(),         // max_out_queue
             enactor_stats->filter_kernel_stats);
 
-        //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(),
-        //      "filter::Kernel RemoveZeroFunctor failed", __FILE__, __LINE__)))
-        //    break;
+        //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
+        //cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates     
 
-        Clear_Zero_R0D <SizeT, VertexId>
-            <<<128, 128, 0, stream>>> (
-            graph_slice->nodes,
-            data_slice -> degrees.GetPointer(util::DEVICE),
-            data_slice -> degrees_pong.GetPointer(util::DEVICE));
+        //printf("Filter end.\n");fflush(stdout); 
+        //enactor_stats->iteration++;
+        frontier_attribute->queue_index++;
+
+        if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
+        //num_elements = queue_length;
 
+        //swap rank_curr and rank_next
         util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>(
-            data_slice->degrees.GetPointer(util::DEVICE),
-            data_slice->degrees_pong.GetPointer(util::DEVICE), graph_slice->nodes);
+            data_slice->rank_curr.GetPointer(util::DEVICE),
+            data_slice->rank_next.GetPointer(util::DEVICE), 
+            graph_slice->nodes);
+        util::MemsetKernel<<<128, 128, 0, stream>>>(           // zero rank_next for all graph_slice->nodes entries
+            data_slice->rank_next.GetPointer(util::DEVICE),
+            (Value)0.0, graph_slice->nodes);
+        // Wait for the stream; on failure keep the error in retval and bail out rather than publishing a queue length after a failed sync.
+        if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return;
+        data_slice->PR_queue_length = frontier_attribute->queue_length;   // read by Stop_Condition to decide whether to stop
+        //enactor_stats      -> Accumulate(
+        //    work_progress  -> GetQueueLengthPointer<unsigned int,SizeT>(frontier_attribute->queue_index), stream);
+        //printf("queue_length = %d\n", frontier_attribute->queue_length);fflush(stdout);
+        if (false) {//if (INSTRUMENT || DEBUG) {               // instrumentation is disabled: the condition is hard-coded to false
+            //if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length,false,stream)) return;
+            //enactor_stats->total_queued += frontier_attribute->queue_length;
+            //if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length);
+            if (Enactor::INSTRUMENT) {
+                if (enactor_stats->retval = enactor_stats->filter_kernel_stats.Accumulate(
+                    enactor_stats->filter_grid_size,
+                    enactor_stats->total_runtimes,
+                    enactor_stats->total_lifetimes,
+                    false, stream)) return;
+            }
+        }
+    }
 
-        //util::DisplayDeviceResults(problem->data_slices[0]->d_degrees,
-        //        graph_slice->nodes);
+    //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return;
+    frontier_attribute->queue_length = data_slice->edge_map_queue_len;   // advance input length comes from edge_map_queue_len
+    //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
+    //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
+    //util::cpu_mt::PrintGPUArray<SizeT, Value>("ranks", data_slice->rank_curr.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
+    // Advance pass: runs on every iteration, including iteration 0 where the filter above is skipped.
+    //printf("Advance start.\n");fflush(stdout); 
+    // Edge Map: advance over keys[selector] with PrFunctor; the output-queue argument is NULL.
+    frontier_attribute->queue_reset = true;
+    gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, PrFunctor>(
+        //d_done,
+        enactor_stats[0],
+        frontier_attribute[0],
+        d_data_slice,
+        (VertexId*)NULL,
+        (bool*    )NULL,
+        (bool*    )NULL,
+        scanned_edges->GetPointer(util::DEVICE),
+        frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE), // d_in_queue
+        NULL, //frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue
+        (VertexId*)NULL,
+        (VertexId*)NULL,
+        graph_slice->row_offsets   .GetPointer(util::DEVICE),
+        graph_slice->column_indices.GetPointer(util::DEVICE),
+        (SizeT*   )NULL,
+        (VertexId*)NULL,
+        graph_slice->nodes,  //graph_slice->frontier_elements[frontier_attribute.selector],  // max_in_queue
+        graph_slice->edges,  //graph_slice->frontier_elements[frontier_attribute.selector^1],// max_out_queue
+        work_progress[0],
+        context[0],
+        stream,
+        gunrock::oprtr::advance::V2V,
+        false,
+        false);
+    // Read the length stored at queue_index+1 for the statistics only; queue_length is reset to edge_map_queue_len right after.
+    if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index+1, frontier_attribute->queue_length, false, stream, true)) return;
+    if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return; // report the failure like the other syncs in this file
+    enactor_stats->total_queued[0] += frontier_attribute->queue_length;
+    frontier_attribute->queue_length = data_slice->edge_map_queue_len;
+    //printf("Advance end.\n");fflush(stdout); 
 
-        frontier_attribute->queue_index++;
-        frontier_attribute->selector^=1;
-        if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
-        if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__)) return;
-        //enactor_stats->total_queued[0] += frontier_attribute->queue_length;
-        //util::cpu_mt::PrintGPUArray<SizeT, VertexId>("keys1", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
-        //util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees1", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
+    //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break;
+    //cudaEventQuery(throttle_event);                                 // give host memory mapped visibility to GPU updates 
 
-        if (num_valid_node == frontier_attribute->queue_length || num_valid_node==0) data_slice->to_continue = false;
-        else data_slice->to_continue = true;
+    /*if (Enactor::DEBUG) {
+        if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
     }
 
-    static cudaError_t Compute_OutputLength(
-        FrontierAttribute<SizeT> *frontier_attribute,
-        SizeT       *d_offsets,
-        VertexId    *d_indices,
-        VertexId    *d_in_key_queue,
-        util::Array1D<SizeT, SizeT>       *partitioned_scanned_edges,
-        SizeT        max_in,
-        SizeT        max_out,
-        CudaContext                    &context,
-        cudaStream_t                   stream,
-        gunrock::oprtr::advance::TYPE  ADVANCE_TYPE,
-        bool                           express = false)
+    if (Enactor::INSTRUMENT) {
+        if (enactor_stats->retval = enactor_stats->advance_kernel_stats.Accumulate(
+            enactor_stats->advance_grid_size,
+            enactor_stats->total_runtimes,
+            enactor_stats->total_lifetimes, false, stream)) return;
+    }*/
+
+    //if (done[0] == 0) break; 
+    
+    //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return;
+
+    //if (done[0] == 0 || frontier_attribute.queue_length == 0 || enactor_stats.iteration > max_iteration) break;
+
+    //if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration);
+    //}
+}
+
+static cudaError_t Compute_OutputLength(          // sizes the scanned-edges array and calls advance::ComputeOutputLength; does nothing in TWC_FORWARD mode
+    FrontierAttribute<SizeT> *frontier_attribute, // queue_length is the size requested for partitioned_scanned_edges
+    SizeT       *d_offsets,
+    VertexId    *d_indices,
+    VertexId    *d_in_key_queue,
+    util::Array1D<SizeT,SizeT>       *partitioned_scanned_edges, // passed to Check_Size before the length computation
+    SizeT        max_in,
+    SizeT        max_out,
+    CudaContext                    &context,
+    cudaStream_t                   stream,
+    gunrock::oprtr::advance::TYPE  ADVANCE_TYPE,
+    bool                           express = false)
+{   
+    //printf("Compute_OutputLength start.\n");fflush(stdout);
+    cudaError_t retval = cudaSuccess;             // returned unchanged when the TWC_FORWARD branch below is taken
+    if (AdvanceKernelPolicy::ADVANCE_MODE ==  gunrock::oprtr::advance::TWC_FORWARD) 
     {
-        cudaError_t retval = cudaSuccess;
+        //return retval;
+    } else {
         bool over_sized = false;
         if (retval = Check_Size<Enactor::SIZE_CHECK, SizeT, SizeT> (
             "scanned_edges", frontier_attribute->queue_length, partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval;
         retval = gunrock::oprtr::advance::ComputeOutputLength
-            <AdvanceKernelPolicy, Problem, RemoveZeroFunctor>(
+            <AdvanceKernelPolicy, Problem, PrFunctor>(
             frontier_attribute,
             d_offsets,
             d_indices,
@@ -449,314 +836,114 @@ public:
             context,
             stream,
             ADVANCE_TYPE,
-            express);
-        return retval;
-    }    
-
-    template <int NUM_VERTEX_ASSOCIATES, int NUM_VALUE__ASSOCIATES>
-    static void Expand_Incoming(
-              int             grid_size,
-              int             block_size,
-              size_t          shared_size,
-              cudaStream_t    stream,
-              SizeT           &num_elements,
-        const VertexId* const keys_in,
-        util::Array1D<SizeT, VertexId>*       keys_out,
-        const size_t          array_size,
-              char*           array,
-              DataSlice*      data_slice)
-    {
-        Expand_Incoming_R0D
-            <VertexId, SizeT, Value, NUM_VERTEX_ASSOCIATES, NUM_VALUE__ASSOCIATES>
-            <<<grid_size, block_size, shared_size, stream>>> (
-            num_elements,
-            keys_in,
-            data_slice->degrees.GetPointer(util::DEVICE));
-        num_elements = 0; 
+            express);     
     }
-
-    static bool Stop_Condition(
-        EnactorStats   *enactor_stats,
-        FrontierAttribute<SizeT> *frontier_attribute,
-        util::Array1D<SizeT, DataSlice> *data_slice,
-        int num_gpus)
+    //printf("Compute_OutputLength end.\n");fflush(stdout); 
+    return retval;
+}    
+
+template <int NUM_VERTEX_ASSOCIATES, int NUM_VALUE__ASSOCIATES>  // both counts are forwarded as template arguments to Expand_Incoming_PR
+static void Expand_Incoming(                                     // launches Expand_Incoming_PR on `stream`, then zeroes num_elements
+          int             grid_size,                             // launch configuration: grid dimension
+          int             block_size,                            // launch configuration: block dimension
+          size_t          shared_size,                           // launch configuration: dynamic shared memory size
+          cudaStream_t    stream,
+          SizeT           &num_elements,                         // in: count passed to the kernel; out: always set to 0
+    const VertexId* const keys_in,
+    util::Array1D<SizeT, VertexId>*       keys_out,              // not used by this implementation
+    const size_t          array_size,
+          char*           array,
+          DataSlice*      data_slice)                            // referenced only by the commented-out debug print below
+{
+    //util::cpu_mt::PrintCPUArray("Incoming_length", &num_elements, 1, data_slice->gpu_idx);
+    Expand_Incoming_PR
+        <VertexId, SizeT, Value, NUM_VERTEX_ASSOCIATES, NUM_VALUE__ASSOCIATES>
+        <<<grid_size, block_size, shared_size, stream>>> (
+        num_elements,
+        keys_in,
+        array_size,
+        array);
+    num_elements = 0;                                            // the caller's count is cleared after the launch is issued
+}
+
+static bool Stop_Condition (                           // returns true when iteration should stop, false to keep going
+    EnactorStats                    *enactor_stats,    // indexed as a flat array of num_gpus*num_gpus entries
+    FrontierAttribute<SizeT>        *frontier_attribute, // not read by the code of this function
+    util::Array1D<SizeT, DataSlice> *data_slice,       // one entry per GPU; PR_queue_length and max_iter are read
+    int num_gpus)
+{
+    bool all_zero = true;                              // cleared below when any GPU has PR_queue_length > 0
+    for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++)  // any recorded CUDA error stops the iteration
+    if (enactor_stats[gpu].retval != cudaSuccess)
     {    
-        //printf("CC Stop checked\n");fflush(stdout);
-        for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++)
-        if (enactor_stats[gpu].retval != cudaSuccess)
-        {    
-            printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout);
-            return true;
-        }    
-
-        /*for (int gpu = 0; gpu< num_gpus*num_gpus; gpu++)
-        if (enactor_stats[gpu].iteration == 0)
-        {
-            printf("enactor_stats[%d].iteration ==0\n", gpu);fflush(stdout);
-            return false;
-        }*/
-
-        for (int gpu=0; gpu<num_gpus; gpu++)
-            if (data_slice[gpu]->to_continue && frontier_attribute[gpu*num_gpus].queue_length !=0)
-        {    
-            //printf("data_slice[%d]->to_continue, frontier_attribute[%d].queue_length = %d\n", gpu, gpu*num_gpus, frontier_attribute[gpu*num_gpus].queue_length);fflush(stdout);
-            return false;
-        }    
-     
-        for (int gpu=0; gpu<num_gpus; gpu++)
-        for (int peer=1; peer<num_gpus; peer++)
-        for (int i=0; i<2; i++) 
-        if (data_slice[gpu]->in_length[i][peer]!=0)
-        {    
-            //printf("data_slice[%d]->in_length[%d][%d] = %d\n", gpu, i, peer, data_slice[gpu]->in_length[i][peer]);fflush(stdout);
-            return false;
-        }    
-
-        for (int gpu=0; gpu<num_gpus; gpu++)
-        for (int peer=1; peer<num_gpus; peer++)
-        if (data_slice[gpu]->out_length[peer]!=0) 
-        {    
-            //printf("data_slice[%d]->out_length[%d] = %d\n", gpu, peer, data_slice[gpu]->out_length[peer]); fflush(stdout);
-            return false;
-        }    
-        //printf("CC to stop\n");fflush(stdout);
+        //printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout);
         return true;
-    }    
+    }  
 
-    template <
-        int NUM_VERTEX_ASSOCIATES,
-        int NUM_VALUE__ASSOCIATES>
-    static void Make_Output(
-        int                            thread_num,
-        SizeT                          num_elements,
-        int                            num_gpus,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        util::Array1D<SizeT, DataSlice>
-                                      *data_slice,
-        GraphSlice                    *graph_slice,
-        util::CtaWorkProgressLifetime *work_progress,
-        ContextPtr                     context,
-        cudaStream_t                   stream)
+    for (int gpu =0; gpu < num_gpus; gpu++)
+    if (data_slice[gpu]->PR_queue_length > 0) 
     {
-        if (num_elements == 0)
-        {
-            for (int peer_ =0; peer_<num_gpus; peer_++)
-                data_slice[0]->out_length[peer_] = 0;
-            return;
-        }
- 
-        int block_size = 256;
-        int grid_size  = num_elements / block_size;
-        int peer_      = 0;
-        if ((num_elements % block_size)!=0) grid_size ++;
-        if (grid_size > 512) grid_size = 512;
-       
-        //util::MemsetKernel<<<128, 128, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), 0, num_elements); 
-        Mark_Queue_R0D <VertexId, SizeT>
-            <<<grid_size, block_size, 0, stream>>> (
-            num_elements,
-            frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
-            data_slice[0] -> degrees.GetPointer(util::DEVICE),
-            data_slice[0] -> markers.GetPointer(util::DEVICE));
-        //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), num_elements, thread_num, enactor_stats->iteration, -1, stream);
-
-        Scan<mgpu::MgpuScanTypeInc>(
-            (int*)data_slice[0] -> markers.GetPointer(util::DEVICE),
-            num_elements,
-            (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
-            (int*)data_slice[0] -> markers.GetPointer(util::DEVICE),
-            context[0]);
-
-        Make_Queue_R0D <VertexId, SizeT>
-            <<<grid_size, block_size, 0, stream>>> (
-            num_elements,
-            frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
-            data_slice[0]->markers.GetPointer(util::DEVICE),
-            data_slice[0]->keys_out[1].GetPointer(util::DEVICE));
-
-        if (!Enactor::SIZE_CHECK)
-            util::MemsetCopyVectorKernel <<<grid_size, block_size, 0, stream>>>(
-                data_slice[0]->frontier_queues[0].keys[frontier_attribute->selector].GetPointer(util::DEVICE),
-                frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE),
-                num_elements);
-
-        cudaMemcpyAsync(&data_slice[0]->out_length[1], data_slice[0]->markers.GetPointer(util::DEVICE) + num_elements -1, sizeof(SizeT), cudaMemcpyDeviceToHost, stream);
-        //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout);
-        if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStramSynchronize failed", __FILE__, __LINE__)) return;
-        //printf("num_lements = %d data_slice[%d]->out_length[1] = %d\n", num_elements, thread_num, data_slice[0]->out_length[1]);fflush(stdout);
-        for (peer_ = 2; peer_ < num_gpus; peer_++)
-            data_slice[0]->out_length[peer_] = data_slice[0]->out_length[1];
-        data_slice[0]->out_length[0] = frontier_attribute->queue_length; 
+        //printf("data_slice[%d].PR_queue_length = %d\n", gpu, data_slice[gpu]->PR_queue_length);
+        all_zero = false;
     }
+    if (all_zero) return true;
 
-    /*static void Check_Queue_Size(
-        int                            thread_num,
-        int                            peer_,
-        SizeT                          request_length,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        //util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        //DataSlice                     *data_slice,
-        //DataSlice                     *d_data_slice,
-        GraphSlice                    *graph_slice
-        //util::CtaWorkProgressLifetime *work_progress,
-        //ContextPtr                     context,
-        //cudaStream_t                   stream
-        )    
-    {    
-        bool over_sized = false;
-        int  selector   = frontier_attribute->selector;
-        int  iteration  = enactor_stats -> iteration;
-
-        if (Enactor::DEBUG)
-            printf("%d\t %d\t %d\t queue_length = %d, output_length = %d\n",
-                thread_num, iteration, peer_,
-                frontier_queue->keys[selector^1].GetSize(),
-                request_length);fflush(stdout);
-
-        if (enactor_stats->retval = 
-            Check_Size<true, SizeT, VertexId > ("queue3", request_length, &frontier_queue->keys  [selector^1], over_sized, thread_num, iteration, peer_, false)) return; 
-        if (enactor_stats->retval = 
-            Check_Size<true, SizeT, VertexId > ("queue3", graph_slice->nodes+2, &frontier_queue->keys  [selector  ], over_sized, thread_num, iteration, peer_, true )) return; 
-        if (Problem::USE_DOUBLE_BUFFER)
-        {    
-            if (enactor_stats->retval = 
-                Check_Size<true, SizeT, Value> ("queue3", request_length, &frontier_queue->values[selector^1], over_sized, thread_num, iteration, peer_, false)) return; 
-            if (enactor_stats->retval = 
-                Check_Size<true, SizeT, Value> ("queue3", graph_slice->nodes+2, &frontier_queue->values[selector  ], over_sized, thread_num, iteration, peer_, true )) return; 
-        }    
-    } */   
+    for (int gpu =0; gpu < num_gpus; gpu++)            // keep going while any GPU is still below the iteration limit
+    if (enactor_stats[gpu * num_gpus].iteration < data_slice[0]->max_iter) // NOTE(review): max_iter is read from data_slice[0] only - presumably equal on all GPUs; confirm
+    {
+        //printf("enactor_stats[%d].iteration = %lld\n", gpu, enactor_stats[gpu * num_gpus].iteration);
+        return false;    
+    } 
+    // No GPU is below max_iter any more: stop.
-}; // end R0DIteration
+    return true;
+}    
 
 template <
-    typename AdvanceKernelPolicy,
-    typename FilterKernelPolicy,
-    typename Enactor>
-struct PRIteration : public IterationBase <
-    AdvanceKernelPolicy, FilterKernelPolicy, Enactor,
-    false, //HAS_SUBQ
-    true,  //HAS_FULLQ
-    false, //BACKWARD
-    true,  //FORWARD
-    false> //UPDATE_PREDECESSORS
+    int NUM_VERTEX_ASSOCIATES,
+    int NUM_VALUE__ASSOCIATES>
+static void Make_Output(
+    int                            thread_num,
+    SizeT                          num_elements,
+    int                            num_gpus,
+    util::DoubleBuffer<SizeT, VertexId, Value>
+                                  *frontier_queue,
+    util::Array1D<SizeT, SizeT>   *scanned_edges,
+    FrontierAttribute<SizeT>      *frontier_attribute,
+    EnactorStats                  *enactor_stats,
+    util::Array1D<SizeT, DataSlice>
+                                  *data_slice,
+    GraphSlice                    *graph_slice,
+    util::CtaWorkProgressLifetime *work_progress,
+    ContextPtr                     context,
+    cudaStream_t                   stream)
 {
-public:
-    typedef typename Enactor::SizeT      SizeT     ;    
-    typedef typename Enactor::Value      Value     ;    
-    typedef typename Enactor::VertexId   VertexId  ;
-    typedef typename Enactor::Problem    Problem   ;
-    typedef typename Problem::DataSlice  DataSlice ;
-    typedef GraphSlice     <SizeT, VertexId, Value> GraphSlice;
-    typedef PRFunctor      <VertexId, SizeT, Value, Problem> PrFunctor;
-    typedef PRMarkerFunctor<VertexId, SizeT, Value, Problem> PrMarkerFunctor;
-
-    static void FullQueue_Core(
-        int                            thread_num,
-        int                            peer_,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        DataSlice                     *data_slice,
-        DataSlice                     *d_data_slice,
-        GraphSlice                    *graph_slice,
-        util::CtaWorkProgressLifetime *work_progress,
-        ContextPtr                     context,
-        cudaStream_t                   stream)
+    //printf("Make_Output entered\n");fflush(stdout);
+    int peer_      = 0;
+    int block_size = 512;
+    int grid_size  = graph_slice->nodes / block_size;
+    if ((graph_slice->nodes % block_size)!=0) grid_size ++;
+    if (grid_size > 512) grid_size = 512;
+
+    if (num_gpus > 1 && enactor_stats->iteration==0)
     {
-        //Print_Const<DataSlice><<<1,1,0,stream>>>(d_data_slice);
-        //for (int i=0; i<3; i++)
-        //{
-        //if (enactor_stats -> iteration != 0 || i!=0)
-        if (enactor_stats -> iteration != 0)
-        {
-            frontier_attribute->queue_length = data_slice -> edge_map_queue_len;
-            enactor_stats->total_queued[0] += frontier_attribute->queue_length;
-
-            //printf("Filter start.\n");fflush(stdout); 
-             // filter kernel
-            gunrock::oprtr::filter::Kernel<FilterKernelPolicy, Problem, PrFunctor>
-            <<<enactor_stats->filter_grid_size, FilterKernelPolicy::THREADS, 0, stream>>>(
-                enactor_stats->iteration,
-                frontier_attribute->queue_reset,
-                frontier_attribute->queue_index,
-                frontier_attribute->queue_length,
-                frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE),      // d_in_queue
-                NULL,
-                NULL,//frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE),// d_out_queue
-                d_data_slice,
-                NULL,
-                work_progress[0],
-                frontier_queue->keys[frontier_attribute->selector  ].GetSize(),           // max_in_queue
-                frontier_queue->keys[frontier_attribute->selector^1].GetSize(),         // max_out_queue
-                enactor_stats->filter_kernel_stats);
-
-            //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "filter_forward::Kernel failed", __FILE__, __LINE__))) break;
-            //cudaEventQuery(throttle_event); // give host memory mapped visibility to GPU updates     
-
-            //printf("Filter end.\n");fflush(stdout); 
-            //enactor_stats->iteration++;
-            frontier_attribute->queue_index++;
-
-            if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
-            //num_elements = queue_length;
-
-            //swap rank_curr and rank_next
-            util::MemsetCopyVectorKernel<<<128, 128, 0, stream>>>(
-                data_slice->rank_curr.GetPointer(util::DEVICE),
-                data_slice->rank_next.GetPointer(util::DEVICE), 
-                graph_slice->nodes);
-            util::MemsetKernel<<<128, 128, 0, stream>>>(
-                data_slice->rank_next.GetPointer(util::DEVICE),
-                (Value)0.0, graph_slice->nodes);
-
-            if (enactor_stats->retval = util::GRError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed", __FILE__, __LINE__));
-            data_slice->PR_queue_length = frontier_attribute->queue_length;
-            //enactor_stats      -> Accumulate(
-            //    work_progress  -> GetQueueLengthPointer<unsigned int,SizeT>(frontier_attribute->queue_index), stream);
-            //printf("queue_length = %d\n", frontier_attribute->queue_length);fflush(stdout);
-            if (false) {//if (INSTRUMENT || DEBUG) {
-                //if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length,false,stream)) return;
-                //enactor_stats->total_queued += frontier_attribute->queue_length;
-                //if (DEBUG) printf(", %lld", (long long) frontier_attribute.queue_length);
-                if (Enactor::INSTRUMENT) {
-                    if (enactor_stats->retval = enactor_stats->filter_kernel_stats.Accumulate(
-                        enactor_stats->filter_grid_size,
-                        enactor_stats->total_runtimes,
-                        enactor_stats->total_lifetimes,
-                        false, stream)) return;
-                }
-            }
-        }
-
-        //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return;
-        frontier_attribute->queue_length = data_slice->edge_map_queue_len;
-        //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, peer_, stream);
-        //if (enactor_stats->iteration == 0) util::cpu_mt::PrintGPUArray<SizeT, SizeT>("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
-        //util::cpu_mt::PrintGPUArray<SizeT, Value>("ranks", data_slice->rank_curr.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, peer_, stream);
-
-        //printf("Advance start.\n");fflush(stdout); 
+        util::MemsetKernel<<<grid_size, block_size, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), (SizeT)0, graph_slice->nodes);
+        frontier_attribute->queue_length = data_slice[0]->edge_map_queue_len;
+        //util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, -1, stream);
+        //util::cpu_mt::PrintGPUArray("row_offsets", graph_slice->row_offsets.GetPointer(util::DEVICE), graph_slice->nodes+1, thread_num, enactor_stats->iteration, -1, stream);
+        //printf("Advance start.\n");fflush(stdout);
+        frontier_attribute->queue_reset = true; 
         // Edge Map
-        frontier_attribute->queue_reset = true;
-        gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, PrFunctor>(
+        gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, PrMarkerFunctor>(
             //d_done,
             enactor_stats[0],
             frontier_attribute[0],
-            d_data_slice,
+            data_slice->GetPointer(util::DEVICE),
             (VertexId*)NULL,
             (bool*    )NULL,
             (bool*    )NULL,
             scanned_edges->GetPointer(util::DEVICE),
             frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE), // d_in_queue
-            NULL, //frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue
+            frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue
             (VertexId*)NULL,
             (VertexId*)NULL,
             graph_slice->row_offsets   .GetPointer(util::DEVICE),
@@ -770,392 +957,205 @@ public:
             stream,
             gunrock::oprtr::advance::V2V,
             false,
-            false);
+            true);
+        //printf("Advance end.\n");fflush(stdout);
+        //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
         
-        if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index+1, frontier_attribute->queue_length, false, stream, true)) return;
-        if (enactor_stats->retval = cudaStreamSynchronize(stream)) return;
-        enactor_stats->total_queued[0] += frontier_attribute->queue_length;
-        frontier_attribute->queue_length = data_slice->edge_map_queue_len;
-        //printf("Advance end.\n");fflush(stdout); 
-
-        //if (DEBUG && (retval = util::GRError(cudaThreadSynchronize(), "edge_map_forward::Kernel failed", __FILE__, __LINE__))) break;
-        //cudaEventQuery(throttle_event);                                 // give host memory mapped visibility to GPU updates 
-
-        /*if (Enactor::DEBUG) {
-            if (enactor_stats->retval = work_progress->GetQueueLength(frontier_attribute->queue_index, frontier_attribute->queue_length, false, stream)) return;
-        }
-
-        if (Enactor::INSTRUMENT) {
-            if (enactor_stats->retval = enactor_stats->advance_kernel_stats.Accumulate(
-                enactor_stats->advance_grid_size,
-                enactor_stats->total_runtimes,
-                enactor_stats->total_lifetimes, false, stream)) return;
-        }*/
-
-        //if (done[0] == 0) break; 
-        
-        //if (enactor_stats->retval = work_progress->SetQueueLength(frontier_attribute->queue_index, edge_map_queue_len)) return;
-
-        //if (done[0] == 0 || frontier_attribute.queue_length == 0 || enactor_stats.iteration > max_iteration) break;
-
-        //if (DEBUG) printf("\n%lld", (long long) enactor_stats.iteration);
-        //}
-    }
-
-    static cudaError_t Compute_OutputLength(
-        FrontierAttribute<SizeT> *frontier_attribute,
-        SizeT       *d_offsets,
-        VertexId    *d_indices,
-        VertexId    *d_in_key_queue,
-        util::Array1D<SizeT,SizeT>       *partitioned_scanned_edges,
-        SizeT        max_in,
-        SizeT        max_out,
-        CudaContext                    &context,
-        cudaStream_t                   stream,
-        gunrock::oprtr::advance::TYPE  ADVANCE_TYPE,
-        bool                           express = false)
-    {   
-        //printf("Compute_OutputLength start.\n");fflush(stdout);
-        cudaError_t retval = cudaSuccess;
-        if (AdvanceKernelPolicy::ADVANCE_MODE ==  gunrock::oprtr::advance::TWC_FORWARD) 
+        for (peer_ = 0; peer_<num_gpus; peer_++)
+            util::MemsetKernel<<<128, 128, 0, stream>>> ( data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), 0, graph_slice->nodes);
+        Assign_Marker_PR<VertexId, SizeT>
+            <<<grid_size, block_size, num_gpus * sizeof(SizeT*), stream>>> (
+            graph_slice->nodes,
+            num_gpus,
+            data_slice[0]->markers.GetPointer(util::DEVICE),
+            graph_slice->partition_table.GetPointer(util::DEVICE),
+            data_slice[0]->keys_markers.GetPointer(util::DEVICE));
+        //for (peer_ = 0; peer_<num_gpus;peer_++)
+        //    util::cpu_mt::PrintGPUArray("keys_marker0", data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
+
+        for (peer_ = 0; peer_<num_gpus;peer_++)
+            Scan<mgpu::MgpuScanTypeInc>(
+                (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)),
+                graph_slice->nodes,
+                (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
+                (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)),
+                context[0]);
+        //for (peer_ = 0; peer_<num_gpus;peer_++)
+        //    util::cpu_mt::PrintGPUArray("keys_marker1", data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
+
+        SizeT temp_length = data_slice[0]->out_length[0];
+        if (graph_slice->nodes > 0) for (peer_ = 0; peer_<num_gpus; peer_++)
         {
-            //return retval;
+            cudaMemcpyAsync(
+                &data_slice[0]->out_length[peer_], 
+                data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE) + (graph_slice->nodes -1),
+                sizeof(SizeT), cudaMemcpyDeviceToHost, stream);
         } else {
-            bool over_sized = false;
-            if (retval = Check_Size<Enactor::SIZE_CHECK, SizeT, SizeT> (
-                "scanned_edges", frontier_attribute->queue_length, partitioned_scanned_edges, over_sized, -1, -1, -1, false)) return retval;
-            retval = gunrock::oprtr::advance::ComputeOutputLength
-                <AdvanceKernelPolicy, Problem, PrFunctor>(
-                frontier_attribute,
-                d_offsets,
-                d_indices,
-                d_in_key_queue,
-                partitioned_scanned_edges->GetPointer(util::DEVICE),
-                max_in,
-                max_out,
-                context,
-                stream,
-                ADVANCE_TYPE,
-                express);     
-        }
-        //printf("Compute_OutputLength end.\n");fflush(stdout); 
-        return retval;
-    }    
-
-    template <int NUM_VERTEX_ASSOCIATES, int NUM_VALUE__ASSOCIATES>
-    static void Expand_Incoming(
-              int             grid_size,
-              int             block_size,
-              size_t          shared_size,
-              cudaStream_t    stream,
-              SizeT           &num_elements,
-        const VertexId* const keys_in,
-        util::Array1D<SizeT, VertexId>*       keys_out,
-        const size_t          array_size,
-              char*           array,
-              DataSlice*      data_slice)
-    {
-        //util::cpu_mt::PrintCPUArray("Incoming_length", &num_elements, 1, data_slice->gpu_idx);
-        Expand_Incoming_PR
-            <VertexId, SizeT, Value, NUM_VERTEX_ASSOCIATES, NUM_VALUE__ASSOCIATES>
-            <<<grid_size, block_size, shared_size, stream>>> (
-            num_elements,
-            keys_in,
-            array_size,
-            array);
-        num_elements = 0; 
-    }
-
-    static bool Stop_Condition (
-        EnactorStats                    *enactor_stats,
-        FrontierAttribute<SizeT>        *frontier_attribute,
-        util::Array1D<SizeT, DataSlice> *data_slice,
-        int num_gpus)
-    {
-        bool all_zero = true;
-        for (int gpu = 0; gpu < num_gpus*num_gpus; gpu++)
-        if (enactor_stats[gpu].retval != cudaSuccess)
-        {    
-            //printf("(CUDA error %d @ GPU %d: %s\n", enactor_stats[gpu].retval, gpu%num_gpus, cudaGetErrorString(enactor_stats[gpu].retval)); fflush(stdout);
-            return true;
-        }  
-
-        for (int gpu =0; gpu < num_gpus; gpu++)
-        if (data_slice[gpu]->PR_queue_length > 0) 
-        {
-            //printf("data_slice[%d].PR_queue_length = %d\n", gpu, data_slice[gpu]->PR_queue_length);
-            all_zero = false;
+            for (peer_ = 1; peer_<num_gpus; peer_++)
+                data_slice[0]->out_length[peer_] = 0;
         }
-        if (all_zero) return true;
+        if (enactor_stats->retval = cudaStreamSynchronize(stream)) return;
 
-        for (int gpu =0; gpu < num_gpus; gpu++)
-        if (enactor_stats[gpu * num_gpus].iteration < data_slice[0]->max_iter)
+        for (peer_ = 0; peer_<num_gpus; peer_++)
         {
-            //printf("enactor_stats[%d].iteration = %lld\n", gpu, enactor_stats[gpu * num_gpus].iteration);
-            return false;    
+            bool over_sized = false;
+            if (peer_>1) {
+                data_slice[0]->keys_out[peer_] = data_slice[0]->temp_keys_out[peer_];
+                data_slice[0]->temp_keys_out[peer_] = util::Array1D<SizeT, VertexId>();
+            }
+            if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, VertexId> (
+                "keys_out", data_slice[0]->out_length[peer_], &data_slice[0]->keys_out[peer_], over_sized, thread_num, enactor_stats->iteration, peer_)) return;
+            if (peer_>0)
+                if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, Value> (
+                    "values_out", data_slice[0]->out_length[peer_], &data_slice[0]->value__associate_out[peer_][0], over_sized, thread_num, enactor_stats->iteration, peer_)) return;
+            data_slice[0]->keys_outs[peer_] = data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE);
+            if (!over_sized) continue;
+            data_slice[0]->value__associate_outs[peer_][0] = data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE);
+            data_slice[0]->value__associate_outs[peer_].Move(util::HOST, util::DEVICE, -1, 0, stream);
         } 
+        data_slice[0]->keys_outs.Move(util::HOST, util::DEVICE, -1, 0, stream);
+        data_slice[0]->out_length[0] = temp_length;
 
-        return true;
-    }    
+        Assign_Keys_PR <VertexId, SizeT>
+            <<<grid_size, block_size, num_gpus * sizeof(SizeT*) *2, stream>>> (
+            graph_slice->nodes,
+            num_gpus,
+            graph_slice->partition_table.GetPointer(util::DEVICE),
+            data_slice[0]->markers      .GetPointer(util::DEVICE),
+            data_slice[0]->keys_markers .GetPointer(util::DEVICE),
+            data_slice[0]->keys_outs    .GetPointer(util::DEVICE));
+            
+        //util::cpu_mt::PrintCPUArray("out_length", &data_slice[0]->out_length[0], num_gpus, thread_num, enactor_stats->iteration);
+        //for (peer_ = 0; peer_<num_gpus; peer_++)
+        //    util::cpu_mt::PrintGPUArray("keys_out[]", data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream);
+    }
 
-    template <
-        int NUM_VERTEX_ASSOCIATES,
-        int NUM_VALUE__ASSOCIATES>
-    static void Make_Output(
-        int                            thread_num,
-        SizeT                          num_elements,
-        int                            num_gpus,
-        util::DoubleBuffer<SizeT, VertexId, Value>
-                                      *frontier_queue,
-        util::Array1D<SizeT, SizeT>   *scanned_edges,
-        FrontierAttribute<SizeT>      *frontier_attribute,
-        EnactorStats                  *enactor_stats,
-        util::Array1D<SizeT, DataSlice>
-                                      *data_slice,
-        GraphSlice                    *graph_slice,
-        util::CtaWorkProgressLifetime *work_progress,
-        ContextPtr                     context,
-        cudaStream_t                   stream)
+    for (peer_ = 1; peer_ < num_gpus; peer_ ++)
     {
-        //printf("Make_Output entered\n");fflush(stdout);
-        int peer_      = 0;
-        int block_size = 512;
-        int grid_size  = graph_slice->nodes / block_size;
-        if ((graph_slice->nodes % block_size)!=0) grid_size ++;
-        if (grid_size > 512) grid_size = 512;
-
-        if (num_gpus > 1 && enactor_stats->iteration==0)
-        {
-            util::MemsetKernel<<<grid_size, block_size, 0, stream>>>(data_slice[0]->markers.GetPointer(util::DEVICE), (SizeT)0, graph_slice->nodes);
-            frontier_attribute->queue_length = data_slice[0]->edge_map_queue_len;
-            //util::cpu_mt::PrintGPUArray("keys", frontier_queue->keys[frontier_attribute->selector].GetPointer(util::DEVICE), frontier_attribute->queue_length, thread_num, enactor_stats->iteration, -1, stream);
-            //util::cpu_mt::PrintGPUArray("row_offsets", graph_slice->row_offsets.GetPointer(util::DEVICE), graph_slice->nodes+1, thread_num, enactor_stats->iteration, -1, stream);
-            //printf("Advance start.\n");fflush(stdout);
-            frontier_attribute->queue_reset = true; 
-            // Edge Map
-            gunrock::oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, PrMarkerFunctor>(
-                //d_done,
-                enactor_stats[0],
-                frontier_attribute[0],
-                data_slice->GetPointer(util::DEVICE),
-                (VertexId*)NULL,
-                (bool*    )NULL,
-                (bool*    )NULL,
-                scanned_edges->GetPointer(util::DEVICE),
-                frontier_queue->keys[frontier_attribute->selector  ].GetPointer(util::DEVICE), // d_in_queue
-                frontier_queue->keys[frontier_attribute->selector^1].GetPointer(util::DEVICE), // d_out_queue
-                (VertexId*)NULL,
-                (VertexId*)NULL,
-                graph_slice->row_offsets   .GetPointer(util::DEVICE),
-                graph_slice->column_indices.GetPointer(util::DEVICE),
-                (SizeT*   )NULL,
-                (VertexId*)NULL,
-                graph_slice->nodes,  //graph_slice->frontier_elements[frontier_attribute.selector],  // max_in_queue
-                graph_slice->edges,  //graph_slice->frontier_elements[frontier_attribute.selector^1],// max_out_queue
-                work_progress[0],
-                context[0],
-                stream,
-                gunrock::oprtr::advance::V2V,
-                false,
-                true);
-            //printf("Advance end.\n");fflush(stdout);
-            //util::cpu_mt::PrintGPUArray("markers", data_slice[0]->markers.GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
-            
-            for (peer_ = 0; peer_<num_gpus; peer_++)
-                util::MemsetKernel<<<128, 128, 0, stream>>> ( data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), 0, graph_slice->nodes);
-            Assign_Marker_PR<VertexId, SizeT>
-                <<<grid_size, block_size, num_gpus * sizeof(SizeT*), stream>>> (
-                graph_slice->nodes,
-                num_gpus,
-                data_slice[0]->markers.GetPointer(util::DEVICE),
-                graph_slice->partition_table.GetPointer(util::DEVICE),
-                data_slice[0]->keys_markers.GetPointer(util::DEVICE));
-            //for (peer_ = 0; peer_<num_gpus;peer_++)
-            //    util::cpu_mt::PrintGPUArray("keys_marker0", data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
-
-            for (peer_ = 0; peer_<num_gpus;peer_++)
-                Scan<mgpu::MgpuScanTypeInc>(
-                    (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)),
-                    graph_slice->nodes,
-                    (int)0, mgpu::plus<int>(), (int*)0, (int*)0,
-                    (int*)(data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE)),
-                    context[0]);
-            //for (peer_ = 0; peer_<num_gpus;peer_++)
-            //    util::cpu_mt::PrintGPUArray("keys_marker1", data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE), graph_slice->nodes, thread_num, enactor_stats->iteration, -1, stream);
-
-            SizeT temp_length = data_slice[0]->out_length[0];
-            if (graph_slice->nodes > 0) for (peer_ = 0; peer_<num_gpus; peer_++)
-            {
-                cudaMemcpyAsync(
-                    &data_slice[0]->out_length[peer_], 
-                    data_slice[0]->keys_marker[peer_].GetPointer(util::DEVICE) + (graph_slice->nodes -1),
-                    sizeof(SizeT), cudaMemcpyDeviceToHost, stream);
-            } else {
-                for (peer_ = 1; peer_<num_gpus; peer_++)
-                    data_slice[0]->out_length[peer_] = 0;
-            }
-            if (enactor_stats->retval = cudaStreamSynchronize(stream)) return;
+        Assign_Values_PR <VertexId, SizeT, Value>
+            <<<grid_size, block_size, 0, stream>>> (
+            data_slice[0]->out_length[peer_],
+            data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE),
+            data_slice[0]->rank_next.GetPointer(util::DEVICE),
+            data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE));
+    }
+    frontier_attribute->selector = data_slice[0]->PR_queue_selector;
+    //for (peer_ = 1; peer_ < num_gpus; peer_++)
+    //{
+    //    util::cpu_mt::PrintGPUArray("values_out[]", data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream);
+    //}
+    if (enactor_stats->retval = cudaStreamSynchronize(stream)) return;
+}
 
-            for (peer_ = 0; peer_<num_gpus; peer_++)
-            {
-                bool over_sized = false;
-                if (peer_>1) {
-                    data_slice[0]->keys_out[peer_] = data_slice[0]->temp_keys_out[peer_];
-                    data_slice[0]->temp_keys_out[peer_] = util::Array1D<SizeT, VertexId>();
-                }
-                if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, VertexId> (
-                    "keys_out", data_slice[0]->out_length[peer_], &data_slice[0]->keys_out[peer_], over_sized, thread_num, enactor_stats->iteration, peer_)) return;
-                if (peer_>0)
-                    if (enactor_stats->retval = Check_Size<Enactor::SIZE_CHECK, SizeT, Value> (
-                        "values_out", data_slice[0]->out_length[peer_], &data_slice[0]->value__associate_out[peer_][0], over_sized, thread_num, enactor_stats->iteration, peer_)) return;
-                data_slice[0]->keys_outs[peer_] = data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE);
-                if (!over_sized) continue;
-                data_slice[0]->value__associate_outs[peer_][0] = data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE);
-                data_slice[0]->value__associate_outs[peer_].Move(util::HOST, util::DEVICE, -1, 0, stream);
-            } 
-            data_slice[0]->keys_outs.Move(util::HOST, util::DEVICE, -1, 0, stream);
-            data_slice[0]->out_length[0] = temp_length;
-
-            Assign_Keys_PR <VertexId, SizeT>
-                <<<grid_size, block_size, num_gpus * sizeof(SizeT*) *2, stream>>> (
-                graph_slice->nodes,
-                num_gpus,
-                graph_slice->partition_table.GetPointer(util::DEVICE),
-                data_slice[0]->markers      .GetPointer(util::DEVICE),
-                data_slice[0]->keys_markers .GetPointer(util::DEVICE),
-                data_slice[0]->keys_outs    .GetPointer(util::DEVICE));
-                
-            //util::cpu_mt::PrintCPUArray("out_length", &data_slice[0]->out_length[0], num_gpus, thread_num, enactor_stats->iteration);
-            //for (peer_ = 0; peer_<num_gpus; peer_++)
-            //    util::cpu_mt::PrintGPUArray("keys_out[]", data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream);
-        }
+}; 
 
-        for (peer_ = 1; peer_ < num_gpus; peer_ ++)
+/**
+ * @brief Worker-thread entry point for PageRank; binds to the GPU in problem->gpu_idx[thread_num].
+ *
+ * @tparam AdvanceKernelPolicy Kernel policy for advance operator.
+ * @tparam FilterKernelPolicy Kernel policy for filter operator.
+ * @tparam PrEnactor PageRank enactor type; supplies Problem, SizeT, VertexId and Value.
+ *
+ * @param[in] thread_data_ Opaque pointer that is cast to this thread's ThreadSlice,
+ *            from which the problem, the enactor and the thread number are read.
+ *            NOTE(review): the context, problem, max_iteration and max_grid_size
+ *            parameters previously documented here are not part of this signature.
+ *
+ * \return CUT_THREADPROC thread result; a SetDevice failure is stored in enactor_stats[0].retval.
+ */
+template<
+    typename AdvanceKernelPolicy,
+    typename FilterKernelPolicy,
+    typename PrEnactor>
+static CUT_THREADPROC PRThread(
+    void * thread_data_)
+{
+    typedef typename PrEnactor::Problem    Problem;
+    typedef typename PrEnactor::SizeT      SizeT;
+    typedef typename PrEnactor::VertexId   VertexId;
+    typedef typename PrEnactor::Value      Value;
+    typedef typename Problem::DataSlice    DataSlice;
+    typedef GraphSlice<SizeT, VertexId, Value> GraphSlice;
+    typedef PRFunctor<VertexId, SizeT, Value, Problem> PrFunctor;
+    ThreadSlice  *thread_data        =  (ThreadSlice*) thread_data_;
+    Problem      *problem            =  (Problem*)     thread_data->problem;
+    PrEnactor    *enactor            =  (PrEnactor*)   thread_data->enactor;
+    //util::cpu_mt::CPUBarrier
+    //             *cpu_barrier        =   thread_data -> cpu_barrier;
+    int           num_gpus           =   problem     -> num_gpus;
+    int           thread_num         =   thread_data -> thread_num;
+    int           gpu_idx            =   problem     -> gpu_idx            [thread_num] ;
+    DataSlice    *data_slice         =   problem     -> data_slices        [thread_num].GetPointer(util::HOST);
+    GraphSlice   *graph_slice        =   problem     -> graph_slices       [thread_num] ;
+    FrontierAttribute<SizeT>
+                 *frontier_attribute = &(enactor     -> frontier_attribute [thread_num * num_gpus]);
+    EnactorStats *enactor_stats      = &(enactor     -> enactor_stats      [thread_num * num_gpus]);
+
+    do {
+        printf("CCThread entered\n");fflush(stdout);
+        if (enactor_stats[0].retval = util::SetDevice(gpu_idx)) break;
+        thread_data->stats = 1;
+        while (thread_data->stats !=2) sleep(0);
+        thread_data->stats = 3;
+
+        for (int peer_=0; peer_<num_gpus; peer_++)
         {
-            Assign_Values_PR <VertexId, SizeT, Value>
-                <<<grid_size, block_size, 0, stream>>> (
-                data_slice[0]->out_length[peer_],
-                data_slice[0]->keys_out[peer_].GetPointer(util::DEVICE),
-                data_slice[0]->rank_next.GetPointer(util::DEVICE),
-                data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE));
+            frontier_attribute[peer_].queue_length  = peer_==0?data_slice->local_nodes : 0;
+            frontier_attribute[peer_].queue_index   = 0;        // Work queue index
+            frontier_attribute[peer_].selector      = 0;
+            frontier_attribute[peer_].queue_reset   = true;
+            enactor_stats     [peer_].iteration     = 0;
         }
-        frontier_attribute->selector = data_slice[0]->PR_queue_selector;
-        //for (peer_ = 1; peer_ < num_gpus; peer_++)
+        //gunrock::app::Iteration_Loop
+        //    <0, 0, PrEnactor, PrFunctor, R0DIteration<AdvanceKernelPolicy, FilterKernelPolicy, PrEnactor> > (thread_data);
+        
+        data_slice->PR_queue_selector = frontier_attribute[0].selector;
+        //for (int peer_=0; peer_<num_gpus; peer_++)
         //{
-        //    util::cpu_mt::PrintGPUArray("values_out[]", data_slice[0]->value__associate_out[peer_][0].GetPointer(util::DEVICE), data_slice[0]->out_length[peer_], thread_num, enactor_stats->iteration, peer_, stream);
+        //    frontier_attribute[peer_].queue_reset = true;
+        //    enactor_stats     [peer_].iteration   = 0;
         //}
-        if (enactor_stats->retval = cudaStreamSynchronize(stream)) return;
-    }
- 
-}; 
-
-    /**
-     * @brief Enacts a page rank computing on the specified graph.
-     *
-     * @tparam AdvanceKernelPolicy Kernel policy for advance operator.
-     * @tparam FilterKernelPolicy Kernel policy for filter operator.
-     * @tparam PRProblem PR Problem type.
-     *
-     * @param[in] context CudaContext pointer for moderngpu APIs.
-     * @param[in] problem PRProblem object.
-     * @param[in] max_iteration Maximum iteration number for PR.
-     * @param[in] max_grid_size Max grid size for PR kernel calls.
-     *
-     * \return cudaError_t object which indicates the success of all CUDA calls.
-     */
-    template<
-        typename AdvanceKernelPolicy,
-        typename FilterKernelPolicy,
-        typename PrEnactor>
-    static CUT_THREADPROC PRThread(
-        void * thread_data_)
-    {
-        typedef typename PrEnactor::Problem    Problem;
-        typedef typename PrEnactor::SizeT      SizeT;
-        typedef typename PrEnactor::VertexId   VertexId;
-        typedef typename PrEnactor::Value      Value;
-        typedef typename Problem::DataSlice    DataSlice;
-        typedef GraphSlice<SizeT, VertexId, Value> GraphSlice;
-        typedef PRFunctor<VertexId, SizeT, Value, Problem> PrFunctor;
-        ThreadSlice  *thread_data        =  (ThreadSlice*) thread_data_;
-        Problem      *problem            =  (Problem*)     thread_data->problem;
-        PrEnactor    *enactor            =  (PrEnactor*)   thread_data->enactor;
-        //util::cpu_mt::CPUBarrier
-        //             *cpu_barrier        =   thread_data -> cpu_barrier;
-        int           num_gpus           =   problem     -> num_gpus;
-        int           thread_num         =   thread_data -> thread_num;
-        int           gpu_idx            =   problem     -> gpu_idx            [thread_num] ;
-        DataSlice    *data_slice         =   problem     -> data_slices        [thread_num].GetPointer(util::HOST);
-        GraphSlice   *graph_slice        =   problem     -> graph_slices       [thread_num] ;
-        FrontierAttribute<SizeT>
-                     *frontier_attribute = &(enactor     -> frontier_attribute [thread_num * num_gpus]);
-        EnactorStats *enactor_stats      = &(enactor     -> enactor_stats      [thread_num * num_gpus]);
-
-        do {
-            printf("CCThread entered\n");fflush(stdout);
-            if (enactor_stats[0].retval = util::SetDevice(gpu_idx)) break;
-            thread_data->stats = 1;
-            while (thread_data->stats !=2) sleep(0);
-            thread_data->stats = 3;
-
-            for (int peer_=0; peer_<num_gpus; peer_++)
-            {
-                frontier_attribute[peer_].queue_length  = peer_==0?data_slice->local_nodes : 0;
-                frontier_attribute[peer_].queue_index   = 0;        // Work queue index
-                frontier_attribute[peer_].selector      = 0;
-                frontier_attribute[peer_].queue_reset   = true;
-                enactor_stats     [peer_].iteration     = 0;
-            }
-            //gunrock::app::Iteration_Loop
-            //    <0, 0, PrEnactor, PrFunctor, R0DIteration<AdvanceKernelPolicy, FilterKernelPolicy, PrEnactor> > (thread_data);
-            
-            data_slice->PR_queue_selector = frontier_attribute[0].selector;
-            //for (int peer_=0; peer_<num_gpus; peer_++)
-            //{
-            //    frontier_attribute[peer_].queue_reset = true;
-            //    enactor_stats     [peer_].iteration   = 0;
-            //}
-            if (num_gpus > 1)
-            {
-                data_slice->value__associate_orgs[0] = data_slice->rank_next.GetPointer(util::DEVICE);
-                data_slice->value__associate_orgs.Move(util::HOST, util::DEVICE);
-                //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier, thread_num);
-                //for (int i=0; i<4; i++)
-                //for (int gpu=0; gpu<num_gpus; gpu++)
-                //for (int stage=0; stage<data_slice->num_stages; stage++)
-                //    data_slice->events_set[i][gpu][stage] = false;
-                //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier+1, thread_num);
-            }
-            data_slice -> edge_map_queue_len = frontier_attribute[0].queue_length;
-            //util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num);
+        if (num_gpus > 1)
+        {
+            data_slice->value__associate_orgs[0] = data_slice->rank_next.GetPointer(util::DEVICE);
+            data_slice->value__associate_orgs.Move(util::HOST, util::DEVICE);
+            //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier, thread_num);
+            //for (int i=0; i<4; i++)
+            //for (int gpu=0; gpu<num_gpus; gpu++)
+            //for (int stage=0; stage<data_slice->num_stages; stage++)
+            //    data_slice->events_set[i][gpu][stage] = false;
+            //util::cpu_mt::IncrementnWaitBarrier(cpu_barrier+1, thread_num);
+        }
+        data_slice -> edge_map_queue_len = frontier_attribute[0].queue_length;
+        //util::cpu_mt::PrintGPUArray("degrees", data_slice->degrees.GetPointer(util::DEVICE), graph_slice->nodes, thread_num);
 
-            // Step through PR iterations
-            gunrock::app::Iteration_Loop
-                <0, 1, PrEnactor, PrFunctor, PRIteration<AdvanceKernelPolicy, FilterKernelPolicy, PrEnactor> > (thread_data);
-            
-            if (thread_num > 0)
-            {
-                bool over_sized = false;
-                if (enactor_stats->retval = Check_Size<PrEnactor::SIZE_CHECK, SizeT, Value>(
-                    "values_out", data_slice->local_nodes, &data_slice->value__associate_out[1][0], over_sized, thread_num, enactor_stats->iteration, -1)) break;
-                if (enactor_stats->retval = Check_Size<PrEnactor::SIZE_CHECK, SizeT, VertexId>(
-                    "keys_out", data_slice->local_nodes, &data_slice->keys_out[1], over_sized, thread_num, enactor_stats->iteration, -1)) break;
-                Assign_Values_PR <VertexId, SizeT, Value>
-                    <<<128, 128, 0, data_slice->streams[0]>>> (
-                    data_slice->local_nodes,
-                    data_slice->keys_out[0].GetPointer(util::DEVICE),
-                    data_slice->rank_curr.GetPointer(util::DEVICE),
-                    data_slice->value__associate_out[1][0].GetPointer(util::DEVICE));
-                util::MemsetCopyVectorKernel<<<128, 128, 0, data_slice->streams[0]>>> (
-                    data_slice->keys_out[1].GetPointer(util::DEVICE),
-                    data_slice->keys_out[0].GetPointer(util::DEVICE),
-                    data_slice->local_nodes);
-                enactor_stats->iteration++;
-                PushNeibor <PrEnactor::SIZE_CHECK, SizeT, VertexId, Value, GraphSlice, DataSlice, 0, 1> (
-                    thread_num,
-                    0,
-                    data_slice->local_nodes,
-                    enactor_stats,
+        // Step through PR iterations
+        gunrock::app::Iteration_Loop
+            <0, 1, PrEnactor, PrFunctor, PRIteration<AdvanceKernelPolicy, FilterKernelPolicy, PrEnactor> > (thread_data);
+        
+        if (thread_num > 0)
+        {
+            bool over_sized = false;
+            if (enactor_stats->retval = Check_Size<PrEnactor::SIZE_CHECK, SizeT, Value>(
+                "values_out", data_slice->local_nodes, &data_slice->value__associate_out[1][0], over_sized, thread_num, enactor_stats->iteration, -1)) break;
+            if (enactor_stats->retval = Check_Size<PrEnactor::SIZE_CHECK, SizeT, VertexId>(
+                "keys_out", data_slice->local_nodes, &data_slice->keys_out[1], over_sized, thread_num, enactor_stats->iteration, -1)) break;
+            Assign_Values_PR <VertexId, SizeT, Value>
+                <<<128, 128, 0, data_slice->streams[0]>>> (
+                data_slice->local_nodes,
+                data_slice->keys_out[0].GetPointer(util::DEVICE),
+                data_slice->rank_curr.GetPointer(util::DEVICE),
+                data_slice->value__associate_out[1][0].GetPointer(util::DEVICE));
+            util::MemsetCopyVectorKernel<<<128, 128, 0, data_slice->streams[0]>>> (
+                data_slice->keys_out[1].GetPointer(util::DEVICE),
+                data_slice->keys_out[0].GetPointer(util::DEVICE),
+                data_slice->local_nodes);
+            enactor_stats->iteration++;
+            PushNeibor <PrEnactor::SIZE_CHECK, SizeT, VertexId, Value, GraphSlice, DataSlice, 0, 1> (
+                thread_num,
+                0,
+                data_slice->local_nodes,
+                enactor_stats,
                     problem->data_slices [thread_num].GetPointer(util::HOST),
                     problem->data_slices [0         ].GetPointer(util::HOST),
                     problem->graph_slices[thread_num],
diff --git a/gunrock/app/sssp/sssp_app.cu b/gunrock/app/sssp/sssp_app.cu
index fa55888be..c7621b8a0 100644
--- a/gunrock/app/sssp/sssp_app.cu
+++ b/gunrock/app/sssp/sssp_app.cu
@@ -8,21 +8,19 @@
 /**
  * @file sssp_app.cu
  *
- * @brief single-source shortest path problem implementation
+ * @brief single-source shortest path (SSSP) application
  */
 
-#include <stdio.h>
 #include <gunrock/gunrock.h>
 
-// Graph construction utils
+// graph construction utilities
 #include <gunrock/graphio/market.cuh>
 
-// SSSP includes
+// single-source shortest path includes
 #include <gunrock/app/sssp/sssp_enactor.cuh>
 #include <gunrock/app/sssp/sssp_problem.cuh>
 #include <gunrock/app/sssp/sssp_functor.cuh>
 
-// Moderngpu include
 #include <moderngpu.cuh>
 
 using namespace gunrock;
@@ -38,7 +36,7 @@ using namespace gunrock::app::sssp;
  * @tparam SizeT
  * @tparam MARK_PREDECESSORS
  *
- * @param[out] ggraph_out GunrockGraph type output
+ * @param[out] graph_o GRGraph type output
  * @param[out] predecessor return predeessor if mark_pred = true
  * @param[in]  graph Reference to the CSR graph we process on
  * @param[in]  source Source node where SSSP starts
@@ -48,125 +46,169 @@ using namespace gunrock::app::sssp;
  * @param[in]  delta_factor user set
  * @param[in]  context moderngpu context
  */
-template <
-    typename VertexId,
-    typename Value,
-    typename SizeT,
-    bool MARK_PREDECESSORS >
+template<typename VertexId, typename Value, typename SizeT,
+         bool MARK_PREDECESSORS>
 void run_sssp(
-    GunrockGraph   *ggraph_out,
-    VertexId       *predecessor,
-    const Csr<VertexId, Value, SizeT> &graph,
-    const VertexId source,
+    GRGraph*       graph_o,
+    VertexId*      predecessor,
+    const Csr<VertexId, Value, SizeT>& csr,
+    const VertexId src,
     const int      max_grid_size,
     const float    queue_sizing,
     const int      num_gpus,
     const int      delta_factor,
-    CudaContext& context) {
-    // Preparations
-    typedef SSSPProblem <
-        VertexId,
-        SizeT,
-        Value,
-        MARK_PREDECESSORS > Problem;
-
+    CudaContext&   context) {
+    typedef SSSPProblem<VertexId, SizeT, Value, MARK_PREDECESSORS> Problem;
     // Allocate host-side label array for gpu-computed results
-    unsigned int *h_labels
-        = (unsigned int*)malloc(sizeof(unsigned int) * graph.nodes);
+    Value *h_labels = (Value*)malloc(sizeof(Value) * csr.nodes);
     //VertexId     *h_preds  = NULL;
 
     if (MARK_PREDECESSORS) {
-        //h_preds = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
+        //h_preds = (VertexId*)malloc(sizeof(VertexId) * csr.nodes);
     }
 
-    // Allocate SSSP enactor map
-    SSSPEnactor<false> sssp_enactor(false);
+    SSSPEnactor<false> enactor(false);  // enactor map
+    Problem *problem = new Problem;
+    util::GRError(problem->Init(false, csr, num_gpus, delta_factor),
+                  "SSSP Problem Initialization Failed", __FILE__, __LINE__);
 
-    // Allocate problem on GPU
-    Problem *csr_problem = new Problem;
-    util::GRError(csr_problem->Init(
-                      false,
-                      graph,
-                      num_gpus,
-                      delta_factor),
-                  "Problem SSSP Initialization Failed", __FILE__, __LINE__);
+    util::GRError(problem->Reset(src, enactor.GetFrontierType(), queue_sizing),
+                  "SSSP Problem Data Reset Failed", __FILE__, __LINE__);
 
-    // Perform SSSP
-    CpuTimer gpu_timer;
+    GpuTimer gpu_timer; float elapsed = 0.0f; gpu_timer.Start();  // start
 
-    util::GRError(csr_problem->Reset(
-                      source, sssp_enactor.GetFrontierType(), queue_sizing),
-                  "SSSP Problem Data Reset Failed", __FILE__, __LINE__);
-    gpu_timer.Start();
-    util::GRError(sssp_enactor.template Enact<Problem>(
-                      context, csr_problem, source,
-                      queue_sizing, max_grid_size),
+    util::GRError(enactor.template Enact<Problem>(
+                      context, problem, src, queue_sizing, max_grid_size),
                   "SSSP Problem Enact Failed", __FILE__, __LINE__);
-    gpu_timer.Stop();
-    float elapsed = gpu_timer.ElapsedMillis();
 
-    // Copy out results
-    util::GRError(csr_problem->Extract(h_labels, predecessor),
-                  "SSSP Problem Data Extraction Failed", __FILE__, __LINE__);
+    gpu_timer.Stop(); elapsed = gpu_timer.ElapsedMillis();  // elapsed time
+    printf(" device elapsed time: %.4f ms\n", elapsed);
 
-    // copy label_values per node to GunrockGraph output
-    ggraph_out->node_values = (unsigned int*)&h_labels[0];
+    util::GRError(problem->Extract(h_labels, predecessor),
+                  "SSSP Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    if (csr_problem) delete csr_problem;
-    //if (h_labels)    free(h_labels);
-    //if (h_preds)     free(h_preds);
+    // copy label_values per node to GRGraph output
+    graph_o->node_values = (Value*)&h_labels[0];
 
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out  GunrockGraph type output
- * @param[out] predecessor return predeessor if mark_pred = true
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  sssp_config sssp specific configurations
- * @param[in]  data_type   sssp data_type configurations
- * @param[in]  context     moderngpu context
+ * @param[out] graph_o     GRGraph type output
+ * @param[out] predecessor Return predecessor if mark_pred = true
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      Primitive-specific configurations
+ * @param[in]  data_t      Data type configurations
+ * @param[in]  context     ModernGPU context
  */
 void dispatch_sssp(
-    GunrockGraph          *ggraph_out,
-    void                  *predecessor,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   sssp_config,
-    const GunrockDataType data_type,
-    CudaContext&          context) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph*       graph_o,
+    void*          predecessor,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t,
+    CudaContext&   context) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
-                // not support yet
-                printf("Not Yet Support This DataType Combination.\n");
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
+                Csr<int, int, int> csr_graph(false);
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+                csr_graph.edge_values    = (int*)graph_i->edge_values;
+
+                // sssp configurations
+                bool  mark_pred        =   0;  // whether to mark predecessors
+                int   src_node         =   0;  // source vertex to start
+                int   num_gpus         =   1;  // number of GPUs
+                int   delta_factor     =   1;  // default delta_factor = 1
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                float max_queue_sizing = 1.0;  // default maximum queue sizing
+
+                // determine source vertex to start sssp
+                switch (config.src_mode) {
+                case randomize: {
+                    src_node = graphio::RandomNode(csr_graph.nodes);
+                    break;
+                }
+                case largest_degree: {
+                    int max_deg = 0;
+                    src_node = csr_graph.GetNodeWithHighestDegree(max_deg);
+                    break;
+                }
+                case manually: {
+                    src_node = config.src_node;
+                    break;
+                }
+                default: {
+                    src_node = 0;
+                    break;
+                }
+                }
+                mark_pred        = config.mark_pred;
+                delta_factor     = config.delta_factor;
+                max_queue_sizing = config.queue_size;
+
+                switch (mark_pred) {
+                case true: {
+                    run_sssp<int, int, int, true>(
+                        graph_o,
+                        (int*)predecessor,
+                        csr_graph,
+                        src_node,
+                        max_grid_size,
+                        max_queue_sizing,
+                        num_gpus,
+                        delta_factor,
+                        context);
+                    break;
+                }
+                case false: {
+                    run_sssp<int, int, int, false>(
+                        graph_o,
+                        (int*)predecessor,
+                        csr_graph,
+                        src_node,
+                        max_grid_size,
+                        max_queue_sizing,
+                        num_gpus,
+                        delta_factor,
+                        context);
+                    break;
+                }
+                }
+                // reset for free memory
+                csr_graph.row_offsets    = NULL;
+                csr_graph.column_indices = NULL;
+                csr_graph.edge_values    = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 // build input csr format graph
                 Csr<int, unsigned int, int> csr_graph(false);
-                csr_graph.nodes          = ggraph_in->num_nodes;
-                csr_graph.edges          = ggraph_in->num_edges;
-                csr_graph.row_offsets    = (int*)ggraph_in->row_offsets;
-                csr_graph.column_indices = (int*)ggraph_in->col_indices;
-                csr_graph.edge_values    = (unsigned int*)ggraph_in->edge_values;
+                csr_graph.nodes          = graph_i->num_nodes;
+                csr_graph.edges          = graph_i->num_edges;
+                csr_graph.row_offsets    = (int*)graph_i->row_offsets;
+                csr_graph.column_indices = (int*)graph_i->col_indices;
+                csr_graph.edge_values    = (unsigned int*)graph_i->edge_values;
 
                 // sssp configurations
-                bool  mark_pred        = false;
-                int   src_node         = 0; //!< use whatever the specified graph-type's default is
-                int   num_gpus         = 1; //!< number of GPUs for multi-gpu enactor to use
-                int   delta_factor     = 1; //!< default delta_factor = 1
-                int   max_grid_size    = 0; //!< maximum grid size (0: leave it up to the enactor)
-                float max_queue_sizing = 1.0; //!< default maximum queue sizing
+                bool  mark_pred        =   0;  // whether to mark predecessors
+                int   src_node         =   0;  // source vertex to start
+                int   num_gpus         =   1;  // number of GPUs
+                int   delta_factor     =   1;  // default delta_factor = 1
+                int   max_grid_size    =   0;  // leave it up to the enactor
+                float max_queue_sizing = 1.0;  // default maximum queue sizing
 
                 // determine source vertex to start sssp
-                switch (sssp_config.src_mode) {
+                switch (config.src_mode) {
                 case randomize: {
                     src_node = graphio::RandomNode(csr_graph.nodes);
                     break;
@@ -177,7 +219,7 @@ void dispatch_sssp(
                     break;
                 }
                 case manually: {
-                    src_node = sssp_config.src_node;
+                    src_node = config.src_node;
                     break;
                 }
                 default: {
@@ -185,14 +227,14 @@ void dispatch_sssp(
                     break;
                 }
                 }
-                mark_pred        = sssp_config.mark_pred;
-                delta_factor     = sssp_config.delta_factor;
-                max_queue_sizing = sssp_config.queue_size;
+                mark_pred        = config.mark_pred;
+                delta_factor     = config.delta_factor;
+                max_queue_sizing = config.queue_size;
 
                 switch (mark_pred) {
                 case true: {
                     run_sssp<int, unsigned int, int, true>(
-                        ggraph_out,
+                        graph_o,
                         (int*)predecessor,
                         csr_graph,
                         src_node,
@@ -205,7 +247,7 @@ void dispatch_sssp(
                 }
                 case false: {
                     run_sssp<int, unsigned int, int, false>(
-                        ggraph_out,
+                        graph_o,
                         (int*)predecessor,
                         csr_graph,
                         src_node,
@@ -245,32 +287,75 @@ void dispatch_sssp(
  * @tparam Value
  * @tparam SizeT
  *
- * @param[out] ggraph_out  GunrockGraph type output
- * @param[out] predecessor return predeessor if mark_pred = true
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  sssp_config gunrock primitive specific configurations
- * @param[in]  data_type   data_type configurations
+ * @param[out] graph_o     GRGraph type output
+ * @param[out] predecessor Return predecessor if mark_pred = true
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      Primitive specific configurations
+ * @param[in]  data_t      Data type configurations
  */
-void gunrock_sssp_func(
-    GunrockGraph          *ggraph_out,
-    void                  *predecessor,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   sssp_config,
-    const GunrockDataType data_type) {
-
-    // moderngpu preparations
-    int device = 0;
-    device = sssp_config.device;
+void gunrock_sssp(
+    GRGraph*       graph_o,
+    void*          predecessor,
+    const GRGraph* graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    unsigned int device = 0;
+    device = config.device;
     ContextPtr context = mgpu::CreateCudaDevice(device);
+    dispatch_sssp(graph_o, predecessor, graph_i, config, data_t, *context);
+}
+
+/**
+ * @brief Simple interface that takes CSR arrays and edge values as input
+ * @param[out] distances   Shortest distance from the source, per node
+ * @param[in]  num_nodes   Number of nodes of the input graph
+ * @param[in]  num_edges   Number of edges of the input graph
+ * @param[in]  row_offsets CSR-formatted graph input row offsets
+ * @param[in]  col_indices CSR-formatted graph input column indices
+ * @param[in]  source      Source vertex to begin the traversal from
+ */
+void sssp(
+    unsigned int*       distances,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices,
+    const unsigned int* edge_values,
+    const int           source) {
+    printf("-------------------- setting --------------------\n");
+
+    struct GRTypes data_t;           // primitive-specific data types
+    data_t.VTXID_TYPE = VTXID_INT;   // integer
+    data_t.SIZET_TYPE = SIZET_INT;   // integer
+    data_t.VALUE_TYPE = VALUE_UINT;  // unsigned integer
+
+    struct GRSetup config;          // primitive-specific configures
+    config.device      =      0;    // setting device to run
+    config.src_node    = source;    // source vertex to begin
+    config.mark_pred   =  false;    // do not mark predecessors
+    config.delta_factor =    32;    // delta factor for delta-stepping
+    config.queue_size  =   1.0f;    // maximum queue size factor
+
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    printf(" loaded %d nodes and %d edges\n", num_nodes, num_edges);
+
+    printf("-------------------- running --------------------\n");
+    gunrock_sssp(graph_o, (void*)NULL, graph_i, config, data_t);
+    memcpy(distances, (unsigned int*)graph_o->node_values,
+           num_nodes * sizeof(unsigned int));
+
+    if (graph_i) free(graph_i);
+    if (graph_o) free(graph_o);
 
-    // lunch dispatch function
-    dispatch_sssp(
-        ggraph_out,
-        predecessor,
-        ggraph_in,
-        sssp_config,
-        data_type,
-        *context);
+    printf("------------------- completed -------------------\n");
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/sssp/sssp_enactor.cuh b/gunrock/app/sssp/sssp_enactor.cuh
index f8d56b085..362d90477 100644
--- a/gunrock/app/sssp/sssp_enactor.cuh
+++ b/gunrock/app/sssp/sssp_enactor.cuh
@@ -575,28 +575,6 @@ public:
     {
         clock_t      start_time = clock();
         cudaError_t  retval     = cudaSuccess;
-        
-        /*typedef PQFunctor<
-            VertexId,
-            SizeT,
-            SSSPProblem> PqFunctor;
-
-        typedef gunrock::priority_queue::PriorityQueue<
-            VertexId,
-            SizeT> NearFarPriorityQueue;
-
-        typedef gunrock::priority_queue::KernelPolicy<
-            SSSPProblem,                        // Problem data type
-            300,                                // CUDA_ARCH
-            INSTRUMENT,                         // INSTRUMENT
-            8,                                  // MIN_CTA_OCCUPANCY
-            10>                                 // LOG_THREADS
-            PriorityQueueKernelPolicy;
-
-        NearFarPriorityQueue *pq = new NearFarPriorityQueue;
-        util::GRError(
-            pq->Init(problem->graph_slices[0]->edges, queue_sizing),
-            "Priority Queue SSSP Initialization Failed", __FILE__, __LINE__);*/
 
         do {
             for (int gpu=0;gpu<this->num_gpus;gpu++)
diff --git a/gunrock/app/sssp/sssp_functor.cuh b/gunrock/app/sssp/sssp_functor.cuh
index f406de7e5..9867a8a97 100644
--- a/gunrock/app/sssp/sssp_functor.cuh
+++ b/gunrock/app/sssp/sssp_functor.cuh
@@ -149,7 +149,7 @@ struct PQFunctor
         float delta;
         util::io::ModifiedLoad<ProblemData::COLUMN_READ_MODIFIER>::Ld(
                         delta, problem->delta);
-        return (delta == 0) ? weight : weight/delta;
+        return (delta == 0) ? weight : weight / delta;
     }
 };
  
diff --git a/gunrock/app/sssp/sssp_problem.cuh b/gunrock/app/sssp/sssp_problem.cuh
index c11261cb1..edfbfe988 100644
--- a/gunrock/app/sssp/sssp_problem.cuh
+++ b/gunrock/app/sssp/sssp_problem.cuh
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <limits>
 #include <gunrock/app/problem_base.cuh>
 #include <gunrock/util/memset_kernel.cuh>
 #include <gunrock/util/array_utils.cuh>
diff --git a/gunrock/app/topk/topk_app.cu b/gunrock/app/topk/topk_app.cu
index 5b2855259..0e38c2fcf 100644
--- a/gunrock/app/topk/topk_app.cu
+++ b/gunrock/app/topk/topk_app.cu
@@ -1,20 +1,16 @@
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 // Gunrock -- Fast and Efficient GPU Graph Library
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 // This source code is distributed under the terms of LICENSE.TXT
 // in the root directory of this source distribution.
-// ----------------------------------------------------------------
+// ----------------------------------------------------------------------------
 
 /**
  * @file topk_app.cu
  *
- * @brief top k degree centralities implementation
+ * @brief top k degree centralities application
  */
 
-#include <cstdlib>
-#include <stdio.h>
-#include <vector>
-#include <iostream>
 #include <gunrock/gunrock.h>
 #include <gunrock/graphio/market.cuh>
 #include <gunrock/app/topk/topk_enactor.cuh>
@@ -77,7 +73,7 @@ template <
     typename Value,
     typename SizeT >
 void build_topk_subgraph(
-    GunrockGraph *subgraph,
+    GRGraph *subgraph,
     const Csr<VertexId, Value, SizeT> &graph_original,
     const Csr<VertexId, Value, SizeT> &graph_reversed,
     VertexId  *node_ids,
@@ -173,49 +169,32 @@ template <
     typename Value,
     typename SizeT >
 void run_topk(
-    GunrockGraph *graph_out,
+    GRGraph *graph_out,
     VertexId     *node_ids,
     Value        *in_degrees,
     Value        *out_degrees,
     const Csr<VertexId, Value, SizeT> &graph_original,
     const Csr<VertexId, Value, SizeT> &graph_reversed,
     SizeT        top_nodes) {
-    // preparations
     typedef TOPKProblem<VertexId, SizeT, Value> Problem;
-    TOPKEnactor<false> topk_enactor(false);
-    Problem *topk_problem = new Problem;
-
-    // reset top_nodes if necessary
+    TOPKEnactor<false> enactor(false);
+    Problem *problem = new Problem;
     top_nodes =
         (top_nodes > graph_original.nodes) ? graph_original.nodes : top_nodes;
 
-    // initialization
-    util::GRError(topk_problem->Init(
-                      false,
-                      graph_original,
-                      graph_reversed,
-                      1),
+    util::GRError(problem->Init(false, graph_original, graph_reversed, 1),
                   "Problem TOPK Initialization Failed", __FILE__, __LINE__);
 
-    // reset data slices
-    util::GRError(topk_problem->Reset(topk_enactor.GetFrontierType()),
+    util::GRError(problem->Reset(enactor.GetFrontierType()),
                   "TOPK Problem Data Reset Failed", __FILE__, __LINE__);
 
-    // launch gpu topk enactor to calculate top k nodes
-    util::GRError(topk_enactor.template Enact<Problem>(
-                      topk_problem,
-                      top_nodes),
+    util::GRError(enactor.template Enact<Problem>(problem, top_nodes),
                   "TOPK Problem Enact Failed", __FILE__, __LINE__);
 
-    // copy out results back to cpu
-    util::GRError(topk_problem->Extract(
-                      node_ids,
-                      in_degrees,
-                      out_degrees,
-                      top_nodes),
+    util::GRError(problem->Extract(node_ids, in_degrees, out_degrees, top_nodes),
                   "TOPK Problem Data Extraction Failed", __FILE__, __LINE__);
 
-    // build a subgraph contains only top k nodes on cpu
+    // build a vertex-induced subgraph containing only the top k nodes
     build_topk_subgraph<VertexId, Value, SizeT>(
         graph_out,
         graph_original,
@@ -223,62 +202,54 @@ void run_topk(
         (int*)node_ids,
         top_nodes);
 
-    // cleanup if neccessary
-    if (topk_problem) { delete topk_problem; }
-
+    if (problem) { delete problem; }
     cudaDeviceSynchronize();
 }
 
 /**
  * @brief dispatch function to handle data_types
  *
- * @param[out] ggraph_out  GunrockGraph type output
+ * @param[out] graph_o     GRGraph type output
  * @param[out] node_ids    output top k node ids
  * @param[out] in_degrees  output top k in-degree centralities
  * @param[out] out_degrees output top k out-degree centralities
- * @param[in]  ggraph_in   GunrockGraph type input graph
- * @param[in]  topk_config topk specific configurations
- * @param[in]  data_type   topk data_type configurations
+ * @param[in]  graph_i     GRGraph type input graph
+ * @param[in]  config      topk specific configurations
+ * @param[in]  data_t      topk data_t configurations
  */
 void dispatch_topk(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *in_degrees,
-    void                  *out_degrees,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   topk_config,
-    const GunrockDataType data_type) {
-    switch (data_type.VTXID_TYPE) {
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *in_degrees,
+    void          *out_degrees,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    switch (data_t.VTXID_TYPE) {
     case VTXID_INT: {
-        switch (data_type.SIZET_TYPE) {
+        switch (data_t.SIZET_TYPE) {
         case SIZET_INT: {
-            switch (data_type.VALUE_TYPE) {
-            case VALUE_INT: {
-                // template type = <int, int, int>
-                // original graph
+            switch (data_t.VALUE_TYPE) {
+            case VALUE_INT: {  // template type = <int, int, int>
                 Csr<int, int, int> graph_original(false);
-                graph_original.nodes = ggraph_in->num_nodes;
-                graph_original.edges = ggraph_in->num_edges;
-                graph_original.row_offsets    = (int*)ggraph_in->row_offsets;
-                graph_original.column_indices = (int*)ggraph_in->col_indices;
-
-                // reversed graph
+                graph_original.nodes = graph_i->num_nodes;
+                graph_original.edges = graph_i->num_edges;
+                graph_original.row_offsets    = (int*)graph_i->row_offsets;
+                graph_original.column_indices = (int*)graph_i->col_indices;
                 Csr<int, int, int> graph_reversed(false);
-                graph_reversed.nodes = ggraph_in->num_nodes;
-                graph_reversed.edges = ggraph_in->num_edges;
-                graph_reversed.row_offsets    = (int*)ggraph_in->col_offsets;
-                graph_reversed.column_indices = (int*)ggraph_in->row_indices;
-
-                //graph_original.DisplayGraph();
+                graph_reversed.nodes = graph_i->num_nodes;
+                graph_reversed.edges = graph_i->num_edges;
+                graph_reversed.row_offsets    = (int*)graph_i->col_offsets;
+                graph_reversed.column_indices = (int*)graph_i->row_indices;
 
                 run_topk<int, int, int>(
-                    ggraph_out,
+                    graph_o,
                     (int*)node_ids,
                     (int*)in_degrees,
                     (int*)out_degrees,
                     graph_original,
                     graph_reversed,
-                    topk_config.top_nodes);
+                    config.top_nodes);
 
                 // reset for free memory
                 graph_original.row_offsets    = NULL;
@@ -287,13 +258,11 @@ void dispatch_topk(
                 graph_reversed.column_indices = NULL;
                 break;
             }
-            case VALUE_UINT: {
-                // template type = <int, uint, int>
+            case VALUE_UINT: {  // template type = <int, uint, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
-            case VALUE_FLOAT: {
-                // template type = <int, float, int>
+            case VALUE_FLOAT: {  // template type = <int, float, int>
                 printf("Not Yet Support This DataType Combination.\n");
                 break;
             }
@@ -309,32 +278,24 @@ void dispatch_topk(
 /*
  * @brief topk dispatch function base on gunrock data types
  *
- * @param[out] ggraph_out  output subgraph of topk problem
+ * @param[out] graph_o     output subgraph of topk problem
  * @param[out] node_ids    output top k node_ids
  * @param[out] in_degrees  output associated centrality values
  * @param[out] out_degrees output associated centrality values
- * @param[in]  ggraph_in   input graph need to process on
- * @param[in]  topk_config gunrock primitive specific configurations
- * @param[in]  data_type   gunrock datatype struct
+ * @param[in]  graph_i     input graph to process
+ * @param[in]  config      gunrock primitive specific configurations
+ * @param[in]  data_t      gunrock data_t struct
  */
-void gunrock_topk_func(
-    GunrockGraph          *ggraph_out,
-    void                  *node_ids,
-    void                  *in_degrees,
-    void                  *out_degrees,
-    const GunrockGraph    *ggraph_in,
-    const GunrockConfig   topk_config,
-    const GunrockDataType data_type) {
-
-    // launch topk dispatch function
-    dispatch_topk(
-        ggraph_out,
-        node_ids,
-        in_degrees,
-        out_degrees,
-        ggraph_in,
-        topk_config,
-        data_type);
+void gunrock_topk(
+    GRGraph       *graph_o,
+    void          *node_ids,
+    void          *in_degrees,
+    void          *out_degrees,
+    const GRGraph *graph_i,
+    const GRSetup  config,
+    const GRTypes  data_t) {
+    dispatch_topk(graph_o, node_ids, in_degrees, out_degrees,
+                  graph_i, config, data_t);
 }
 
 // Leave this at the end of the file
diff --git a/gunrock/app/vis/vis_enactor.cuh b/gunrock/app/vis/vis_enactor.cuh
new file mode 100644
index 000000000..590863cb6
--- /dev/null
+++ b/gunrock/app/vis/vis_enactor.cuh
@@ -0,0 +1,395 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_enactor.cuh
+ * @brief Primitive problem enactor for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/util/kernel_runtime_stats.cuh>
+#include <gunrock/util/test_utils.cuh>
+
+#include <gunrock/oprtr/advance/kernel.cuh>
+#include <gunrock/oprtr/advance/kernel_policy.cuh>
+#include <gunrock/oprtr/filter/kernel.cuh>
+#include <gunrock/oprtr/filter/kernel_policy.cuh>
+
+#include <gunrock/app/enactor_base.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+#include <gunrock/app/vis/vis_functor.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Primitive enactor class.
+ * @tparam INSTRUMENT Whether to collect per-CTA clock-count statistics
+ */
+template<bool INSTRUMENT>
+class VISEnactor : public EnactorBase {
+ protected:
+    /**
+     * A pinned, mapped word that the traversal kernels will signal when done
+     */
+    volatile int *done;
+    int          *d_done;
+    cudaEvent_t  throttle_event;
+
+    /**
+     * @brief Prepare the enactor for kernel call.
+     * @param[in] problem Problem object holds both graph and primitive data.
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <typename ProblemData>
+    cudaError_t Setup(ProblemData *problem) {
+        typedef typename ProblemData::SizeT    SizeT;
+        typedef typename ProblemData::VertexId VertexId;
+        // NOTE(review): VertexId is not referenced anywhere below
+        cudaError_t retval = cudaSuccess;
+
+        // one-time setup of the host-mapped "done" word; skipped on later calls
+        if (!done) {
+            int flags = cudaHostAllocMapped;
+
+            // allocate pinned, mapped host memory for a single int (done)
+            if (retval = util::GRError(
+                    cudaHostAlloc((void**)&done, sizeof(int) * 1, flags),
+                    "Enactor cudaHostAlloc done failed",
+                    __FILE__, __LINE__)) return retval;
+
+            // obtain the device-side pointer (d_done) for the same word
+            if (retval = util::GRError(
+                    cudaHostGetDevicePointer((void**)&d_done, (void*) done, 0),
+                    "Enactor cudaHostGetDevicePointer done failed",
+                    __FILE__, __LINE__)) return retval;
+
+            // create throttle event (timing disabled)
+            if (retval = util::GRError(
+                    cudaEventCreateWithFlags(&throttle_event, cudaEventDisableTiming),
+                    "Enactor cudaEventCreateWithFlags throttle_event failed",
+                    __FILE__, __LINE__)) return retval;
+        }
+        // reset on every call, including calls that skip the block above
+        done[0] = -1;
+
+        // only graph slice 0 is used by this function
+        typename ProblemData::GraphSlice *graph_slice = problem->graph_slices[0];
+        // TODO: uncomment if using data_slice to store primitive-specific array
+        //typename ProblemData::DataSlice *data_slice = problem->data_slices[0];
+
+        do {
+            // bind the row-offsets texture over (nodes + 1) SizeT entries
+            cudaChannelFormatDesc row_offsets_desc = cudaCreateChannelDesc<SizeT>();
+            oprtr::edge_map_forward::RowOffsetTex<SizeT>::ref.channelDesc = row_offsets_desc;
+            if (retval = util::GRError(
+                    cudaBindTexture(
+                        0,
+                        oprtr::edge_map_forward::RowOffsetTex<SizeT>::ref,
+                        graph_slice->d_row_offsets,
+                        (graph_slice->nodes + 1) * sizeof(SizeT)),
+                    "Enactor cudaBindTexture row_offset_tex_ref failed",
+                    __FILE__, __LINE__)) break;
+        } while (0);
+        return retval;
+    }
+
+ public:
+    /**
+     * @brief Constructor
+     */
+    explicit VISEnactor(bool DEBUG = false) :  // done/d_done start NULL; Setup() allocates them on first use
+        EnactorBase(EDGE_FRONTIERS, DEBUG), done(NULL), d_done(NULL) {}
+
+    /**
+     * @brief Destructor
+     */
+    virtual ~VISEnactor() {
+        if (done) {  // non-NULL only once Setup() has run its allocation branch, which also creates throttle_event
+            util::GRError(cudaFreeHost((void*)done),
+                "Enactor FreeHost done failed", __FILE__, __LINE__);
+            util::GRError(cudaEventDestroy(throttle_event),
+                "Enactor Destroy throttle_event failed", __FILE__, __LINE__);
+        }
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Obtain statistics the primitive enacted.
+     * @param[out] num_iterations Number of iterations (BSP super-steps).
+     */
+    template <typename VertexId>
+    void GetStatistics(VertexId &num_iterations) {
+        cudaDeviceSynchronize();  // replaces deprecated cudaThreadSynchronize(); waits for device work before reading stats
+        num_iterations = enactor_stats.iteration;
+    }
+
+    /** @} */
+
+    /**
+     * @brief Enacts computing on the specified graph.
+     *
+     * @tparam AdvanceKernelPolicy Kernel policy for advance operator.
+     * @tparam FilterKernelPolicy Kernel policy for filter operator.
+     * @tparam Problem Problem type.
+     *
+     * @param[in] context CudaContext pointer for ModernGPU APIs
+     * @param[in] problem Problem object.
+     * @param[in] max_grid_size Max grid size for kernel calls.
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <
+        typename AdvanceKernelPolicy,
+        typename FilterKernelPolicy,
+        typename Problem >
+    cudaError_t EnactVIS(
+        CudaContext & context,
+        Problem     * problem,
+        int         max_grid_size = 0) {
+        typedef typename Problem::VertexId VertexId;
+        typedef typename Problem::Value    Value;
+        typedef typename Problem::SizeT    SizeT;
+
+        // VISFunctor is declared as <VertexId, SizeT, Value, ProblemData>
+        typedef VISFunctor<VertexId, SizeT, Value, Problem> Functor;
+
+        cudaError_t retval = cudaSuccess;
+        // declared outside the do-block so that every exit path can free it
+        unsigned int *d_scanned_edges = NULL;
+
+        do {
+            fflush(stdout);
+
+            // lazy initialization
+            if (retval = Setup(problem)) break;
+
+            if (retval = EnactorBase::Setup(
+                    max_grid_size,
+                    AdvanceKernelPolicy::CTA_OCCUPANCY,
+                    FilterKernelPolicy::CTA_OCCUPANCY))
+                break;
+
+            // single-gpu graph slice and data slice
+            typename Problem::GraphSlice *g_slice = problem->graph_slices[0];
+            typename Problem::DataSlice *d_slice = problem->d_data_slices[0];
+
+            // LB-only scratch buffer (nothing to free if this malloc fails)
+            if (AdvanceKernelPolicy::ADVANCE_MODE == oprtr::advance::LB) {
+                if (retval = util::GRError(
+                        cudaMalloc((void**)&d_scanned_edges,
+                        g_slice->edges * sizeof(unsigned int)),
+                        "VISProblem cudaMalloc d_scanned_edges failed",
+                        __FILE__, __LINE__)) return retval;
+            }
+
+            frontier_attribute.queue_length = g_slice->nodes;
+            frontier_attribute.queue_index  = 0;  // work queue index
+            frontier_attribute.selector     = 0;
+            frontier_attribute.queue_reset  = true;
+
+            // filter: input all vertices in graph, output selected vertices
+            oprtr::filter::Kernel<FilterKernelPolicy, Problem, Functor>
+                <<<enactor_stats.filter_grid_size, FilterKernelPolicy::THREADS>>>(
+                enactor_stats.iteration + 1,
+                frontier_attribute.queue_reset,
+                frontier_attribute.queue_index,
+                enactor_stats.num_gpus,
+                frontier_attribute.queue_length,
+                d_done,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                NULL,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                d_slice,
+                NULL,
+                work_progress,
+                g_slice->frontier_elements[frontier_attribute.selector],
+                g_slice->frontier_elements[frontier_attribute.selector^1],
+                enactor_stats.filter_kernel_stats);
+
+            if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
+                "filter::Kernel failed", __FILE__, __LINE__))) break;
+            cudaEventQuery(throttle_event);
+
+            frontier_attribute.queue_index++;
+            frontier_attribute.selector ^= 1;
+
+            if (retval = work_progress.GetQueueLength(
+                    frontier_attribute.queue_index,
+                    frontier_attribute.queue_length)) break;
+            if (DEBUG) {
+                printf("filter queue length: %lld",
+                       (long long) frontier_attribute.queue_length);
+                util::DisplayDeviceResults(
+                    problem->data_slices[0]->d_bitmask, g_slice->nodes);
+                printf("input queue for advance:\n");
+                util::DisplayDeviceResults(
+                    g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                    frontier_attribute.queue_length);
+            }
+
+            // advance (V2V): its input queue is the filter's output queue
+            oprtr::advance::LaunchKernel<AdvanceKernelPolicy, Problem, Functor>(
+                NULL,
+                enactor_stats,
+                frontier_attribute,
+                d_slice,
+                (VertexId*)NULL,
+                (bool*)NULL,
+                (bool*)NULL,
+                d_scanned_edges,
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector],
+                g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                (VertexId*)NULL,
+                (VertexId*)NULL,
+                g_slice->d_row_offsets,
+                g_slice->d_column_indices,
+                (SizeT*)NULL,
+                (VertexId*)NULL,
+                g_slice->nodes,
+                g_slice->edges,
+                this->work_progress,
+                context,
+                gunrock::oprtr::advance::V2V);
+
+            if (DEBUG && (retval = util::GRError(cudaDeviceSynchronize(),
+                "advance::Kernel failed", __FILE__, __LINE__))) break;
+            cudaEventQuery(throttle_event);
+
+            frontier_attribute.queue_index++;
+            if (DEBUG) {
+                if (retval = work_progress.GetQueueLength(
+                        frontier_attribute.queue_index,
+                        frontier_attribute.queue_length)) break;
+                printf("advance queue length: %lld",
+                       (long long) frontier_attribute.queue_length);
+                util::DisplayDeviceResults(
+                    g_slice->frontier_queues.d_keys[frontier_attribute.selector^1],
+                    frontier_attribute.queue_length);
+            }
+            // TODO: extract graph with proper format (edge list, csr, etc.)
+        } while (0);
+        // release the LB scratch buffer on success and on every break above
+        if (d_scanned_edges) cudaFree(d_scanned_edges);
+
+        if (DEBUG) {
+            printf("\nGPU Vertex-Induced Subgraph Enact Done.\n");
+        }
+
+        return retval;
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Primitive enact kernel entry.
+     *
+     * @tparam Problem Problem type. @see Problem
+     *
+     * @param[in] context CudaContext pointer for ModernGPU APIs
+     * @param[in] problem Pointer to Problem object.
+     * @param[in] max_grid_size Max grid size for kernel calls.
+     * @param[in] traversal_mode Traversal Mode for advance operator:
+     *            Load-balanced or Dynamic cooperative
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    template <typename Problem>
+    cudaError_t Enact(
+        CudaContext &context,
+        Problem     *problem,
+        int         max_grid_size  = 0,
+        int         traversal_mode = 0) {
+        // to reduce compile time only the SM 3.0+ policies exist for now
+        // TODO: add all the kernel policy setting for all architectures
+        if (this->cuda_props.device_sm_version < 300) {
+            printf("Not yet tuned for this architecture\n");
+            return cudaErrorInvalidDeviceFunction;
+        }
+
+        typedef oprtr::filter::KernelPolicy <
+            Problem,             // Problem data type
+            300,                 // CUDA_ARCH
+            INSTRUMENT,          // INSTRUMENT
+            0,                   // SATURATION QUIT
+            true,                // DEQUEUE_PROBLEM_SIZE
+            8,                   // MIN_CTA_OCCUPANCY
+            8,                   // LOG_THREADS
+            1,                   // LOG_LOAD_VEC_SIZE
+            0,                   // LOG_LOADS_PER_TILE
+            5,                   // LOG_RAKING_THREADS
+            5,                   // END_BITMASK_CULL
+            8 >                  // LOG_SCHEDULE_GRANULARITY
+            FilterKernelPolicy;
+
+        typedef oprtr::advance::KernelPolicy <
+            Problem,             // Problem data type
+            300,                 // CUDA_ARCH
+            INSTRUMENT,          // INSTRUMENT
+            1,                   // MIN_CTA_OCCUPANCY
+            7,                   // LOG_THREADS
+            8,                   // LOG_BLOCKS
+            32 * 128,            // LIGHT_EDGE_THRESHOLD (used for LB)
+            1,                   // LOG_LOAD_VEC_SIZE
+            0,                   // LOG_LOADS_PER_TILE
+            5,                   // LOG_RAKING_THREADS
+            32,                  // WARP_GATHER_THRESHOLD
+            128 * 4,             // CTA_GATHER_THRESHOLD
+            7,                   // LOG_SCHEDULE_GRANULARITY
+            oprtr::advance::TWC_FORWARD >
+            ForwardAdvanceKernelPolicy;
+
+        typedef oprtr::advance::KernelPolicy <
+            Problem,             // Problem data type
+            300,                 // CUDA_ARCH
+            INSTRUMENT,          // INSTRUMENT
+            1,                   // MIN_CTA_OCCUPANCY
+            10,                  // LOG_THREADS
+            8,                   // LOG_BLOCKS
+            32 * 128,            // LIGHT_EDGE_THRESHOLD (used for LB)
+            1,                   // LOG_LOAD_VEC_SIZE
+            0,                   // LOG_LOADS_PER_TILE
+            5,                   // LOG_RAKING_THREADS
+            32,                  // WARP_GATHER_THRESHOLD
+            128 * 4,             // CTA_GATHER_THRESHOLD
+            7,                   // LOG_SCHEDULE_GRANULARITY
+            oprtr::advance::LB >
+            LBAdvanceKernelPolicy;
+
+        // traversal_mode 0 selects LBAdvanceKernelPolicy; any other value
+        // selects ForwardAdvanceKernelPolicy
+        const bool use_load_balanced = (traversal_mode == 0);
+        if (!use_load_balanced) {
+            return EnactVIS<
+                ForwardAdvanceKernelPolicy, FilterKernelPolicy, Problem>(
+                    context, problem, max_grid_size);
+        }
+        return EnactVIS<
+            LBAdvanceKernelPolicy, FilterKernelPolicy, Problem>(
+                context, problem, max_grid_size);
+    }
+
+    /** @} */
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/app/vis/vis_functor.cuh b/gunrock/app/vis/vis_functor.cuh
new file mode 100644
index 000000000..7611d42d0
--- /dev/null
+++ b/gunrock/app/vis/vis_functor.cuh
@@ -0,0 +1,108 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_functor.cuh
+ * @brief Device functions for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/app/problem_base.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Structure contains device functions
+ *
+ * @tparam VertexId    Type used for vertex id (e.g., uint32)
+ * @tparam SizeT       Type used for array indexing. (e.g., uint32)
+ * @tparam Value       Type used for calculation values (e.g., float)
+ * @tparam ProblemData Problem data type which contains data slice
+ *
+ */
+template<typename VertexId, typename SizeT,
+         typename Value, typename ProblemData>
+struct VISFunctor {
+    typedef typename ProblemData::DataSlice DataSlice;
+
+    /**
+     * @brief Advance condition function
+     *
+     * @param[in] s_id Vertex Id of the edge source node
+     * @param[in] d_id Vertex Id of the edge destination node
+     * @param[in] problem Data slice object
+     * @param[in] e_id Output edge id
+     * @param[in] e_id_in Input edge id
+     *
+     * \return problem->d_bitmask[d_id]: whether to load the apply function
+     *         for the edge and include the destination in the next frontier.
+     */
+    static __device__ __forceinline__ bool
+    CondEdge(VertexId s_id, VertexId d_id, DataSlice *problem,
+             VertexId e_id = 0, VertexId e_id_in = 0) {
+        return problem->d_bitmask[d_id];
+    }
+
+    /**
+     * @brief Advance apply function: prints the selected edge
+     * @param[in] s_id Vertex Id of the edge source node
+     * @param[in] d_id Vertex Id of the edge destination node
+     * @param[in] problem Data slice object
+     * @param[in] e_id Output edge id
+     * @param[in] e_id_in Input edge id
+     * Ids are widened to long long so the format matches any VertexId type.
+     */
+    static __device__ __forceinline__ void
+    ApplyEdge(VertexId s_id, VertexId d_id, DataSlice *problem,
+              VertexId e_id = 0, VertexId e_id_in = 0) {
+        printf("select edges: sid: %lld, did: %lld, eid: %lld\n",
+               (long long)s_id, (long long)d_id, (long long)e_id);
+    }
+
+    /**
+     * @brief filter condition function (placeholder: keeps even vertex ids)
+     * @param[in] node Vertex Id
+     * @param[in] problem Data slice object
+     * @param[in] v Auxiliary value
+     * @param[in] nid Not used by this functor
+     *
+     * \return Whether to load the apply function for the node and
+     *         include it in the outgoing vertex frontier.
+     */
+    static __device__ __forceinline__ bool
+    CondFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) {
+        return (node % 2) == 0;  // TODO: USER-DEFINED FILTER CONDITION HERE
+    }
+
+    /**
+     * @brief filter apply function: stores true into d_bitmask[node]
+     * @param[in] node Vertex Id
+     * @param[in] problem Data slice object
+     * @param[in] v Auxiliary value
+     * @param[in] nid Not used by this functor
+     * The store uses ProblemData::QUEUE_WRITE_MODIFIER as its modifier.
+     */
+    static __device__ __forceinline__ void
+    ApplyFilter(VertexId node, DataSlice *problem, Value v = 0, SizeT nid = 0) {
+        util::io::ModifiedStore<ProblemData::QUEUE_WRITE_MODIFIER>::St(
+            true, problem->d_bitmask + node);
+    }
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/app/vis/vis_problem.cuh b/gunrock/app/vis/vis_problem.cuh
new file mode 100644
index 000000000..85519391b
--- /dev/null
+++ b/gunrock/app/vis/vis_problem.cuh
@@ -0,0 +1,294 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file vis_problem.cuh
+ * @brief GPU storage management structure for Vertex-Induced Subgraph
+ */
+
+#pragma once
+
+#include <gunrock/app/problem_base.cuh>
+#include <gunrock/util/memset_kernel.cuh>
+
+namespace gunrock {
+namespace app {
+namespace vis {
+
+/**
+ * @brief Problem structure stores device-side vectors
+ * @tparam _VertexId Type use as vertex id (e.g., uint32)
+ * @tparam _SizeT    Type use for array indexing. (e.g., uint32)
+ * @tparam _Value    Type use for computed value.
+ */
+template<typename _VertexId, typename _SizeT, typename _Value>
+struct VISProblem : ProblemBase<_VertexId, _SizeT, false> {
+    typedef _VertexId VertexId;
+    typedef _SizeT    SizeT;
+    typedef _Value    Value;
+
+    static const bool MARK_PREDECESSORS  = true;
+    static const bool ENABLE_IDEMPOTENCE = false;
+
+    /**
+     * @brief Data slice structure which contains problem specific data.
+     */
+    struct DataSlice {
+        // device storage arrays (both allocated with `nodes` entries in Init)
+        VertexId *d_labels;   // per-vertex labels; Extract() copies them to host
+        bool     *d_bitmask;  // used for indicating if vertex is in subgraph
+    };
+
+    int       num_gpus;
+    SizeT     nodes;
+    SizeT     edges;
+
+    // data slices (one for each GPU)
+    DataSlice **data_slices;
+
+    // putting structure on device while keeping the SoA structure
+    DataSlice **d_data_slices;
+
+    // device index for each data slice
+    int       *gpu_idx;
+
+    /** @brief Default constructor: no GPUs, empty graph, all slice pointers NULL. */
+    VISProblem() :
+        num_gpus(0), nodes(0), edges(0),
+        data_slices(NULL), d_data_slices(NULL), gpu_idx(NULL) {}
+
+    /**
+     * @brief Constructor
+     * @param[in] stream_from_host Whether to stream data from host.
+     * @param[in] graph Reference to the CSR graph object we process on.
+     * @param[in] num_gpus Number of the GPUs used.
+     */
+    VISProblem(bool  stream_from_host,  // only meaningful for single-GPU
+                  const Csr<VertexId, Value, SizeT> &graph,
+                  int   num_gpus) :
+        num_gpus(num_gpus) {
+        Init(stream_from_host, graph, num_gpus);  // NOTE(review): the cudaError_t returned by Init is discarded here
+    }
+
+    /**
+     * @brief Default destructor
+     */
+    ~VISProblem() {
+        for (int i = 0; i < num_gpus; ++i) {
+            if (util::GRError(
+                cudaSetDevice(gpu_idx[i]),
+                "~Problem cudaSetDevice failed", __FILE__, __LINE__)) break;
+            if (data_slices[i]->d_labels)
+                util::GRError(cudaFree(data_slices[i]->d_labels),
+                    "GpuSlice cudaFree d_labels failed", __FILE__, __LINE__);
+            if (data_slices[i]->d_bitmask)
+                util::GRError(cudaFree(data_slices[i]->d_bitmask),
+                    "DataSlice cudaFree d_bitmask failed", __FILE__, __LINE__);
+            if (d_data_slices[i])
+                util::GRError(cudaFree(d_data_slices[i]),
+                    "GpuSlice cudaFree data_slices failed", __FILE__, __LINE__);
+            delete data_slices[i];  // host-side slice from `new DataSlice` in Init
+        }
+        // gpu_idx comes from malloc in Init and was indexed by the loop above
+        if (num_gpus > 0) free(gpu_idx);
+        if (d_data_slices) delete[] d_data_slices;
+        if (data_slices)   delete[]   data_slices;
+    }
+
+    /**
+     * \addtogroup PublicInterface
+     * @{
+     */
+
+    /**
+     * @brief Copy results computed on the GPU back to host-side vectors.
+     * @param[out] h_labels
+     *\return cudaError_t object indicates the success of all CUDA functions.
+     */
+    cudaError_t Extract(VertexId *h_labels) {
+        cudaError_t retval = cudaSuccess;
+        // only the single-GPU path does any work; every failure is returned
+        do {
+            if (num_gpus == 1) {
+                if (retval = util::GRError(cudaSetDevice(gpu_idx[0]),
+                                  "Problem cudaSetDevice failed",
+                                  __FILE__, __LINE__)) break;
+                // copy `nodes` labels from device slice 0 into h_labels
+                if (retval = util::GRError(
+                        cudaMemcpy(h_labels,
+                                   data_slices[0]->d_labels,
+                                   sizeof(VertexId) * nodes,
+                                   cudaMemcpyDeviceToHost),
+                        "Problem cudaMemcpy d_labels failed",
+                        __FILE__, __LINE__)) break;
+
+                // TODO: code to extract other results here
+
+            } else {
+                // multi-GPU extension code
+            }
+        } while (0);
+
+        return retval;
+    }
+
+    /**
+     * @brief Problem initialization
+     *
+     * @param[in] stream_from_host Whether to stream data from host.
+     * @param[in] graph Reference to the CSR graph object we process on.
+     * @param[in] _num_gpus Number of the GPUs used.
+     *
+     * \return cudaError_t object indicates the success of all CUDA functions.
+     */
+    cudaError_t Init(
+        bool  stream_from_host,  // only meaningful for single-GPU
+        const Csr<VertexId, Value, SizeT> &graph,
+        int   _num_gpus) {
+        num_gpus = _num_gpus;
+        nodes    = graph.nodes;
+        edges    = graph.edges;
+        VertexId *h_row_offsets    = graph.row_offsets;  // NOTE(review): confirm this type when SizeT != VertexId
+        VertexId *h_column_indices = graph.column_indices;
+
+        ProblemBase<_VertexId, _SizeT, false>::Init(
+            stream_from_host,
+            nodes,
+            edges,
+            h_row_offsets,
+            h_column_indices,
+            NULL,
+            NULL,
+            num_gpus);
+
+        // no data in DataSlice needs to be copied from host
+        cudaError_t retval = cudaSuccess;
+        data_slices   = new DataSlice * [num_gpus];
+        d_data_slices = new DataSlice * [num_gpus];
+
+        do {
+            if (num_gpus <= 1) {
+                gpu_idx = (int*)malloc(sizeof(int));
+                // create the host slice first and null every pointer, so the
+                // destructor never frees garbage after a partial failure below
+                data_slices[0] = new DataSlice;
+                data_slices[0]->d_labels  = NULL;
+                data_slices[0]->d_bitmask = NULL;
+                d_data_slices[0]          = NULL;
+                // record the currently-set GPU for this single data slice
+                int gpu;
+                if (retval = util::GRError(
+                    cudaGetDevice(&gpu), "Problem cudaGetDevice failed",
+                    __FILE__, __LINE__)) break;
+                gpu_idx[0] = gpu;
+
+                if (retval = util::GRError(
+                    cudaMalloc((void**)&d_data_slices[0], sizeof(DataSlice)),
+                    "Problem cudaMalloc d_data_slices failed",
+                    __FILE__, __LINE__)) return retval;
+
+                // create SoA on device
+                VertexId *d_labels;
+                if (retval = util::GRError(
+                    cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)),
+                    "Problem cudaMalloc d_labels failed",
+                    __FILE__, __LINE__)) return retval;
+                data_slices[0]->d_labels = d_labels;
+
+                bool *d_bitmask;
+                if (retval = util::GRError(
+                    cudaMalloc((void**)&d_bitmask, nodes * sizeof(bool)),
+                    "Problem cudaMalloc d_bitmask failed",
+                    __FILE__, __LINE__)) return retval;
+                data_slices[0]->d_bitmask = d_bitmask;
+                util::MemsetKernel<<<128, 128>>>(
+                   data_slices[0]->d_bitmask, (bool)false, nodes);
+            }
+            // add multi-GPU allocation code
+        } while (0);
+
+        return retval;
+    }
+
+    /**
+     *  @brief Resets per-run state: labels set to -1, data slices uploaded.
+     *  @param[in] frontier_type Frontier type (i.e., edge / vertex / mixed)
+     *  @param[in] queue_sizing Size scaling factor for work queue allocation
+     *  \return Error of the first failing checked CUDA call, else cudaSuccess.
+     */
+    cudaError_t Reset(
+        FrontierType frontier_type,  // type (i.e., edge / vertex / mixed)
+        double queue_sizing) {
+        // size scaling factor for work queue allocation (e.g., 1.0 creates
+        // n-element and m-element vertex and edge frontiers, respectively).
+        // 0.0 is unspecified.
+
+        typedef ProblemBase<_VertexId, _SizeT, false> BaseProblem;
+
+        // load ProblemBase Reset (NOTE: its return value is not checked)
+        BaseProblem::Reset(frontier_type, queue_sizing);
+
+        cudaError_t retval = cudaSuccess;
+
+        for (int gpu = 0; gpu < num_gpus; ++gpu) {
+            // setting device
+            if (retval = util::GRError(
+                    cudaSetDevice(gpu_idx[gpu]),
+                    "Problem cudaSetDevice failed",
+                    __FILE__, __LINE__)) return retval;
+
+            // allocate output labels if necessary
+            if (!data_slices[gpu]->d_labels) {
+                VertexId *d_labels;
+                if (retval = util::GRError(
+                        cudaMalloc((void**)&d_labels, nodes * sizeof(VertexId)),
+                        "Problem cudaMalloc d_labels failed",
+                        __FILE__, __LINE__)) return retval;
+                data_slices[gpu]->d_labels = d_labels;
+            }
+
+            util::MemsetKernel<<< 128, 128>>>(
+                data_slices[gpu]->d_labels, -1, nodes);
+
+            if (!data_slices[gpu]->d_bitmask) {
+                bool *d_bitmask;
+                if (retval = util::GRError(cudaMalloc(
+                    (void**)&d_bitmask, nodes * sizeof(bool)),
+                    "Problem cudaMalloc d_bitmask failed",
+                    __FILE__, __LINE__)) return retval;
+                data_slices[gpu]->d_bitmask = d_bitmask;
+            }
+
+            if (retval = util::GRError(
+                    cudaMemcpy(d_data_slices[gpu],
+                               data_slices[gpu],
+                               sizeof(DataSlice),
+                               cudaMemcpyHostToDevice),
+                    "Problem cudaMemcpy data_slices to d_data_slices failed",
+                    __FILE__, __LINE__)) return retval;
+        }
+
+        // TODO: fill in the initial input_queue for problem
+        // e.g., put every vertex in frontier queue
+        util::MemsetIdxKernel<<<128, 128>>>(
+            BaseProblem::graph_slices[0]->frontier_queues.d_keys[0], nodes);
+
+        return retval;
+    }
+
+    /** @} */
+};
+
+}  // namespace vis
+}  // namespace app
+}  // namespace gunrock
+
+// Leave this at the end of the file
+// Local Variables:
+// mode:c++
+// c-file-style: "NVIDIA"
+// End:
diff --git a/gunrock/coo.cuh b/gunrock/coo.cuh
index e6b585a4c..008dec22b 100644
--- a/gunrock/coo.cuh
+++ b/gunrock/coo.cuh
@@ -37,8 +37,7 @@ struct Coo {
 
     Coo(VertexId row, VertexId col, Value val) : row(row), col(col), val(val) {}
 
-    void Val(Value &value)
-    {
+    void Val(Value &value) {
         value = val;
     }
 };
@@ -71,8 +70,7 @@ struct Coo<VertexId, util::NullType> {
 template<typename Coo>
 bool RowFirstTupleCompare (
     Coo elem1,
-    Coo elem2)
-{
+    Coo elem2) {
     if (elem1.row < elem2.row) {
         // Sort edges by source node
         return true;
@@ -97,8 +95,7 @@ bool RowFirstTupleCompare (
 template<typename Coo>
 bool ColumnFirstTupleCompare (
     Coo elem1,
-    Coo elem2)
-{
+    Coo elem2) {
     if (elem1.col < elem2.col) {
         // Sort edges by source node
         return true;
diff --git a/gunrock/csr.cuh b/gunrock/csr.cuh
index e73b2d5cc..3c56f2f1d 100644
--- a/gunrock/csr.cuh
+++ b/gunrock/csr.cuh
@@ -37,8 +37,7 @@ namespace gunrock {
  * the graph as a sparse matrix.
  */
 template<typename VertexId, typename Value, typename SizeT>
-struct Csr
-{
+struct Csr {
     SizeT nodes;     /**< Number of nodes in the graph. */
     SizeT edges;     /**< Number of edges in the graph. */
     SizeT out_nodes; /**< Number of nodes which have outgoing edges. */
@@ -60,8 +59,7 @@ struct Csr
      * @param[in] pinned Use pinned memory for CSR data structure
      * (default: do not use pinned memory)
      */
-    Csr(bool pinned = false)
-    {
+    Csr(bool pinned = false) {
         nodes = 0;
         edges = 0;
         average_degree = 0;
@@ -82,8 +80,7 @@ struct Csr
      * @param[in] edges Number of edges in COO-format graph
      */
     template <bool LOAD_EDGE_VALUES, bool LOAD_NODE_VALUES>
-    void FromScratch(SizeT nodes, SizeT edges)
-    {
+    void FromScratch(SizeT nodes, SizeT edges) {
         this->nodes = nodes;
         this->edges = edges;
 
@@ -92,32 +89,32 @@ struct Csr
             // Put our graph in pinned memory
             int flags = cudaHostAllocMapped;
             if (gunrock::util::GRError(
-                    cudaHostAlloc((void **)&row_offsets,
-                                  sizeof(SizeT) * (nodes + 1), flags),
-                    "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__))
+                        cudaHostAlloc((void **)&row_offsets,
+                                      sizeof(SizeT) * (nodes + 1), flags),
+                        "Csr cudaHostAlloc row_offsets failed", __FILE__, __LINE__))
                 exit(1);
             if (gunrock::util::GRError(
-                    cudaHostAlloc((void **)&column_indices,
-                                  sizeof(VertexId) * edges, flags),
-                    "Csr cudaHostAlloc column_indices failed",
-                    __FILE__, __LINE__))
+                        cudaHostAlloc((void **)&column_indices,
+                                      sizeof(VertexId) * edges, flags),
+                        "Csr cudaHostAlloc column_indices failed",
+                        __FILE__, __LINE__))
                 exit(1);
 
             if (LOAD_NODE_VALUES) {
                 if (gunrock::util::GRError(
-                        cudaHostAlloc((void **)&node_values,
-                                      sizeof(Value) * nodes, flags),
-                        "Csr cudaHostAlloc node_values failed",
-                        __FILE__, __LINE__))
+                            cudaHostAlloc((void **)&node_values,
+                                          sizeof(Value) * nodes, flags),
+                            "Csr cudaHostAlloc node_values failed",
+                            __FILE__, __LINE__))
                     exit(1);
             }
 
             if (LOAD_EDGE_VALUES) {
                 if (gunrock::util::GRError(
-                        cudaHostAlloc((void **)&edge_values,
-                                      sizeof(Value) * edges, flags),
-                        "Csr cudaHostAlloc edge_values failed",
-                        __FILE__, __LINE__))
+                            cudaHostAlloc((void **)&edge_values,
+                                          sizeof(Value) * edges, flags),
+                            "Csr cudaHostAlloc edge_values failed",
+                            __FILE__, __LINE__))
                     exit(1);
             }
 
@@ -127,51 +124,52 @@ struct Csr
             row_offsets = (SizeT*) malloc(sizeof(SizeT) * (nodes + 1));
             column_indices = (VertexId*) malloc(sizeof(VertexId) * edges);
             node_values = (LOAD_NODE_VALUES) ?
-                (Value*) malloc(sizeof(Value) * nodes) : NULL;
+                          (Value*) malloc(sizeof(Value) * nodes) : NULL;
             edge_values = (LOAD_EDGE_VALUES) ?
-                (Value*) malloc(sizeof(Value) * edges) : NULL;
+                          (Value*) malloc(sizeof(Value) * edges) : NULL;
         }
     }
 
     /**
      *
-     * @brief Store graph information into files
+     * @brief Store the CSR graph arrays into a binary file
      *
      */
-    void WriteToFile(
-        char     *file_name,
-        bool     undirected,
-        bool     reversed,
-        SizeT    num_nodes,
-        SizeT    num_edges,
-        SizeT    *row_offsets,
-        VertexId *col_indices,
-        Value    *edge_values = NULL)
-    {
-        printf("==> Writing into file:  %s\n", file_name);
-        time_t mark1 = time(NULL);
-
-        std::ofstream output(file_name);
-        if (output.is_open())
-        {
-            output << num_nodes << " " << num_edges << " ";
-            std::copy(row_offsets, row_offsets + num_nodes + 1,
-                      std::ostream_iterator<SizeT>(output, " "));
-            std::copy(column_indices, column_indices + num_edges,
-                      std::ostream_iterator<VertexId>(output, " "));
-            if (edge_values != NULL)
-            {
-                std::copy(edge_values, edge_values + num_edges,
-                          std::ostream_iterator<Value>(output, " "));
+    void WriteToFile(char  *file_name, SizeT v, SizeT e, SizeT *row,
+                     VertexId *col, Value *edge_values = NULL) {
+        std::ofstream fout(file_name, std::ios::binary);  // raw bytes
+        if (fout.is_open()) {
+            fout.write(reinterpret_cast<const char*>(&v), sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(&e), sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(row), (v+1)*sizeof(SizeT));
+            fout.write(reinterpret_cast<const char*>(col), e*sizeof(VertexId));
+            if (edge_values != NULL) {
+                fout.write(reinterpret_cast<const char*>(edge_values),
+                           e * sizeof(Value));
             }
-            output.close();
-        } else
-        {
-          std::cout << "Cannot Open The File." << std::endl;
+            fout.close();
         }
+    }
 
-        time_t mark2 = time(NULL);
-        printf("Finished writing in %ds.\n", (int)(mark2 - mark1));
+    void WriteToLigraFile(char  *file_name, SizeT v, SizeT e, SizeT *row,
+                     VertexId *col, Value *edge_values = NULL) {
+        char adj_name[256];
+        snprintf(adj_name, sizeof(adj_name), "%s.adj", file_name);  // bounded
+        printf("writing to ligra .adj file.\n");
+        // Layout: "v v e", then v offsets, e column ids, optional e values
+        std::ofstream fout3(adj_name);
+        if (fout3.is_open()) {
+            fout3 << v << " " << v << " " << e << '\n';
+            for (SizeT i = 0; i < v; ++i)
+                fout3 << row[i] << '\n';
+            for (SizeT i = 0; i < e; ++i)
+                fout3 << col[i] << '\n';
+            if (edge_values != NULL) {
+                for (SizeT i = 0; i < e; ++i)
+                    fout3 << edge_values[i] << '\n';
+            }
+            fout3.close();
+        }
     }
 
     /**
@@ -180,63 +178,36 @@ struct Csr
      *
      */
     template <bool LOAD_EDGE_VALUES>
-    void FromCsr(char *f_in, bool undirected, bool reversed)
-    {
-        printf("  Reading directly from previously stored CSR arrays ...\n");
-
-        std::ifstream _file;
-        char buf[65536];
-        _file.rdbuf()->pubsetbuf(buf,65536);
-        _file.open(f_in);
-
-        if (_file.is_open())
-        {
-            time_t mark1 = time(NULL);
-
-            std::istream_iterator<int> start(_file), end;
-            std::vector<int> v(start, end);
-
-            SizeT csr_nodes = v.at(0);
-            SizeT csr_edges = v.at(1);
-            printf("#nodes = %lld, #edges = %lld, #v = %lld\n", (long long)csr_nodes, (long long)csr_edges, (long long)v.size());
-
-            FromScratch<LOAD_EDGE_VALUES, false>(csr_nodes, csr_edges);
+    void FromCsr(char *f_in) {
+        printf("  Reading directly from stored binary CSR arrays ...\n");
+        time_t mark1 = time(NULL);
 
-            std::copy(v.begin() + 2, v.begin() + 3 + csr_nodes, row_offsets);
-            std::copy(v.begin() + 3 + csr_nodes,
-                      v.begin() + 3 + csr_nodes + csr_edges,
-                      column_indices);
-            if(LOAD_EDGE_VALUES)
-            {
-                std::copy(v.begin() + 3 + csr_nodes + csr_edges,
-                          v.end(), edge_values);
-            }
+        std::ifstream input(f_in, std::ios::binary);  // file holds raw bytes
+        SizeT v = 0, e = 0;  // stay zero if the file cannot be opened
+        input.read(reinterpret_cast<char*>(&v), sizeof(SizeT));
+        input.read(reinterpret_cast<char*>(&e), sizeof(SizeT));
 
-            time_t mark2 = time(NULL);
-            printf("Done reading (%ds).\n", (int) (mark2 - mark1));
+        FromScratch<LOAD_EDGE_VALUES, false>(v, e);
 
-            v.clear();
-        }
-        else
-        {
-            perror("Unable To Open The File.");
+        input.read(reinterpret_cast<char*>(row_offsets), (v + 1)*sizeof(SizeT));
+        input.read(reinterpret_cast<char*>(column_indices), e*sizeof(VertexId));
+        if (LOAD_EDGE_VALUES) {
+            input.read(reinterpret_cast<char*>(edge_values), e*sizeof(Value));
         }
 
+        time_t mark2 = time(NULL);
+        printf("Done reading (%ds).\n", (int) (mark2 - mark1));
+
         // compute out_nodes
         SizeT out_node = 0;
-        for (SizeT node = 0; node < nodes; node++)
-        {
-            if (row_offsets[node+1] - row_offsets[node] > 0)
-            {
+        for (SizeT node = 0; node < nodes; node++) {
+            if (row_offsets[node + 1] - row_offsets[node] > 0) {
                 ++out_node;
             }
         }
         out_nodes = out_node;
-
-        fflush(stdout);
     }
 
-
     /**
      * @brief Build CSR graph from COO graph, sorted or unsorted
      *
@@ -366,33 +337,20 @@ struct Csr
         printf("Done converting (%ds).\n", (int)(mark2 - mark1));
 
         // Write offsets, indices, node, edges etc. into file
-        if (LOAD_EDGE_VALUES)
-	    {
-	        WriteToFile(output_file, 
-		      undirected, 
-		      reversed, 
-		      nodes, 
-		      edges, 
-		      row_offsets, 
-		      column_indices, 
-		      edge_values);
+        if (LOAD_EDGE_VALUES) {
+            WriteToFile(output_file, nodes, edges,
+                        row_offsets, column_indices, edge_values);
+            //WriteToLigraFile(output_file, nodes, edges,
+            //            row_offsets, column_indices, edge_values);
         } else {
-	        WriteToFile(output_file, 
-		      undirected, 
-		      reversed,
-		      nodes, 
-		      edges, 
-		      row_offsets, 
-		      column_indices);
+            WriteToFile(output_file, nodes, edges,
+                        row_offsets, column_indices);
         }
 
-        fflush(stdout);
-
         // Compute out_nodes
         SizeT out_node = 0;
         for (SizeT node = 0; node < nodes; node++) {
-            if (row_offsets[node+1] - row_offsets[node] > 0)
-            {
+            if (row_offsets[node + 1] - row_offsets[node] > 0) {
                 ++out_node;
             }
         }
@@ -407,8 +365,7 @@ struct Csr
     /**
      * @brief Print log-scale degree histogram of the graph.
      */
-    void PrintHistogram()
-    {
+    void PrintHistogram() {
         fflush(stdout);
 
         // Initialize
@@ -436,7 +393,6 @@ struct Csr
         }
         printf("\nDegree Histogram (%lld vertices, %lld edges):\n",
                (long long) nodes, (long long) edges);
-
         printf("    Degree   0: %d (%.2f%%)\n", log_counts[0],
                (float) log_counts[0] * 100.0 / nodes);
         for (int i = 0; i < max_log_length + 1; i++) {
@@ -451,9 +407,8 @@ struct Csr
     /**
      * @brief Display CSR graph to console
      */
-    void DisplayGraph(bool with_edge_value = false)
-    {
-        SizeT displayed_node_num = (nodes > 40) ? 40:nodes;
+    void DisplayGraph(bool with_edge_value = false) {
+        SizeT displayed_node_num = (nodes > 40) ? 40 : nodes;
         printf("First %d nodes's neighbor list of the input graph:\n",
                displayed_node_num);
         for (SizeT node = 0; node < displayed_node_num; node++) {
@@ -465,7 +420,7 @@ struct Csr
                 if (edge - row_offsets[node] > 40) break;
                  printf("[");
                 util::PrintValue(column_indices[edge]);
-                if (with_edge_value) {
+                if (with_edge_value && edge_values != NULL) {
                     printf(",");
                     util::PrintValue(edge_values[edge]);
                 }
@@ -545,19 +500,19 @@ struct Csr
     {
         for (SizeT node = 0; node < nodes; ++node) {
             for (SizeT edge = row_offsets[node];
-                 edge < row_offsets[node+1];
-                 ++edge) {
-                 int src_node = node;
-                 int dst_node = column_indices[edge];
-                 int edge_value = edge_values[edge];
-                 for (SizeT r_edge = row_offsets[dst_node];
-                 r_edge < row_offsets[dst_node+1];
-                 ++r_edge) {
+                    edge < row_offsets[node + 1];
+                    ++edge) {
+                int src_node = node;
+                int dst_node = column_indices[edge];
+                int edge_value = edge_values[edge];
+                for (SizeT r_edge = row_offsets[dst_node];
+                        r_edge < row_offsets[dst_node + 1];
+                        ++r_edge) {
                     if (column_indices[r_edge] == src_node) {
                         if (edge_values[r_edge] != edge_value)
                             return false;
                     }
-                 }
+                }
             }
         }
         return true;
@@ -566,14 +521,12 @@ struct Csr
     /**
      * @brief Find node with largest neighbor list
      */
-    int GetNodeWithHighestDegree(int& max_degree)
-    {
+    int GetNodeWithHighestDegree(int& max_degree) {
         int degree = 0;
         int src = 0;
         for (SizeT node = 0; node < nodes; node++) {
-            if (row_offsets[node+1] - row_offsets[node] > degree)
-            {
-                degree = row_offsets[node+1]-row_offsets[node];
+            if (row_offsets[node + 1] - row_offsets[node] > degree) {
+                degree = row_offsets[node + 1] - row_offsets[node];
                 src = node;
             }
         }
@@ -584,16 +537,15 @@ struct Csr
     /**
      * @brief Display the neighbor list of a given node
      */
-    void DisplayNeighborList(VertexId node)
-    {
+    void DisplayNeighborList(VertexId node) {
         if (node < 0 || node >= nodes) return;
         for (SizeT edge = row_offsets[node];
-                 edge < row_offsets[node + 1];
-                 edge++) {
-                util::PrintValue(column_indices[edge]);
-                printf(", ");
-            }
-            printf("\n");
+                edge < row_offsets[node + 1];
+                edge++) {
+            util::PrintValue(column_indices[edge]);
+            printf(", ");
+        }
+        printf("\n");
     }
 
     /**
@@ -604,7 +556,7 @@ struct Csr
             double mean = 0, count = 0;
             for (SizeT node = 0; node < nodes; ++node) {
                 count += 1;
-                mean += (row_offsets[node+1]- row_offsets[node] - mean) / count;
+                mean += (row_offsets[node+1]-row_offsets[node]-mean)/count;
             }
             average_degree = static_cast<SizeT>(mean);
         }
@@ -650,8 +602,7 @@ struct Csr
     /**
      * @brief Deallocates CSR graph
      */
-    void Free()
-    {
+    void Free() {
         if (row_offsets) {
             if (pinned) {
                 gunrock::util::GRError(cudaFreeHost(row_offsets),
@@ -682,8 +633,7 @@ struct Csr
     /**
      * @brief CSR destructor
      */
-    ~Csr()
-    {
+    ~Csr() {
         Free();
     }
 };
diff --git a/gunrock/graphio/market.cuh b/gunrock/graphio/market.cuh
index 3795d3bca..12c9a3235 100644
--- a/gunrock/graphio/market.cuh
+++ b/gunrock/graphio/market.cuh
@@ -56,8 +56,7 @@ int ReadMarketStream(
     char *output_file,
     Csr<VertexId, Value, SizeT> &csr_graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     typedef Coo<VertexId, Value> EdgeTupleType;
 
     SizeT edges_read = -1;
@@ -73,7 +72,7 @@ int ReadMarketStream(
 
     bool ordered_rows = true;
 
-    while(true) {
+    while (true) {
 
         if (fscanf(f_in, "%[^\n]\n", line) <= 0) {
             break;
@@ -110,7 +109,7 @@ int ReadMarketStream(
             fflush(stdout);
 
             // Allocate coo graph
-            coo = (EdgeTupleType*) malloc(sizeof(EdgeTupleType) * edges);
+            coo = (EdgeTupleType*)malloc(sizeof(EdgeTupleType) * edges);
 
             edges_read++;
 
@@ -122,26 +121,27 @@ int ReadMarketStream(
                 return -1;
             }
             if (edges_read >= edges) {
-              fprintf(stderr,
-                      "Error parsing MARKET graph:"
-                      "encountered more than %d edges\n",
-                      edges);
-              if (coo) free(coo);
-              return -1;
+                fprintf(stderr,
+                        "Error parsing MARKET graph:"
+                        "encountered more than %d edges\n",
+                        edges);
+                if (coo) free(coo);
+                return -1;
             }
 
             long long ll_row, ll_col, ll_value;
+            // Value ll_value;  // use instead to parse float / double values
             int num_input;
             if (LOAD_VALUES) {
                 if ((num_input = sscanf(
-                         line, "%lld %lld %lld",
-                         &ll_col, &ll_row, &ll_value)) < 2) {
+                                     line, "%lld %lld %lld",
+                                     &ll_col, &ll_row, &ll_value)) < 2) {
                     fprintf(stderr,
                             "Error parsing MARKET graph: badly formed edge\n");
                     if (coo) free(coo);
                     return -1;
                 } else if (num_input == 2) {
-                    ll_value = 1;
+                    ll_value = rand() % 64;
                 }
             } else {
                 if (sscanf(line, "%lld %lld", &ll_col, &ll_row) != 2) {
@@ -205,7 +205,6 @@ int ReadMarketStream(
                                             undirected, reversed);
 
     free(coo);
-
     fflush(stdout);
 
     return 0;
@@ -216,13 +215,9 @@ int ReadMarketStream(
  *
  */
 template <bool LOAD_VALUES, typename VertexId, typename Value, typename SizeT>
-int ReadCsrArrays(
-    char *f_in,
-    Csr<VertexId, Value, SizeT> &csr_graph,
-    bool undirected,
-    bool reversed)
-{
-    csr_graph.template FromCsr<LOAD_VALUES>(f_in, undirected, reversed);
+int ReadCsrArrays(char *f_in, Csr<VertexId, Value, SizeT> &csr_graph,
+                  bool undirected, bool reversed) {
+    csr_graph.template FromCsr<LOAD_VALUES>(f_in);
     return 0;
 }
 
@@ -249,34 +244,30 @@ int BuildMarketGraph(
     char *output_file,
     Csr<VertexId, Value, SizeT> &csr_graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     FILE *_file = fopen(output_file, "r");
-    if (_file)
-    {
+    if (_file) {
         fclose(_file);
         if (ReadCsrArrays<LOAD_VALUES>(
-                output_file, csr_graph, undirected, reversed) != 0) {
+                    output_file, csr_graph, undirected, reversed) != 0) {
             return -1;
         }
-    }
-    else {
+    } else {
         if (mm_filename == NULL) {
             // Read from stdin
             printf("Reading from stdin:\n");
             if (ReadMarketStream<LOAD_VALUES>(
-                    stdin, output_file, csr_graph, undirected, reversed) != 0) {
+                        stdin, output_file, csr_graph, undirected, reversed) != 0) {
                 return -1;
             }
-        }
-        else {
+        } else {
             // Read from file
             FILE *f_in = fopen(mm_filename, "r");
             if (f_in) {
                 printf("Reading from %s:\n", mm_filename);
                 if (ReadMarketStream<LOAD_VALUES>(
-                        f_in, output_file, csr_graph,
-                        undirected, reversed) != 0) {
+                            f_in, output_file, csr_graph,
+                            undirected, reversed) != 0) {
                     fclose(f_in);
                     return -1;
                 }
@@ -299,37 +290,29 @@ int BuildMarketGraph(
     char *file_in,
     Csr<VertexId, Value, SizeT> &graph,
     bool undirected,
-    bool reversed)
-{
+    bool reversed) {
     // seperate the graph path and the file name
     char *temp1 = strdup(file_in);
     char *temp2 = strdup(file_in);
     char *file_path = dirname (temp1);
     char *file_name = basename(temp2);
 
-    if (undirected)
-    {
-        char ud[256];
-        sprintf(ud, "%s/.%s_undirected_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, ud, graph, true, false) != 0)
+    if (undirected) {
+        char ud[256];  // undirected graph
+        sprintf(ud, "%s/.%s.ud.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, ud, graph, true, false) != 0)
             return 1;
-    }
-    else if (!undirected && reversed)
-    {
-        char rv[256];
-        sprintf(rv, "%s/.%s_reversed_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, rv, graph, false, true) != 0)
+    } else if (!undirected && reversed) {
+        char rv[256];  // reversed graph
+        sprintf(rv, "%s/.%s.rv.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, rv, graph, false, true) != 0)
             return 1;
-    }
-    else if (!undirected && !reversed)
-    {
-        char nr[256];
-        sprintf(nr, "%s/.%s_nonreversed_csr", file_path, file_name);
-        if (BuildMarketGraph<true>(file_in, nr, graph, false, false) != 0)
+    } else if (!undirected && !reversed) {
+        char di[256];  // directed graph
+        sprintf(di, "%s/.%s.di.bin", file_path, file_name);
+        if (BuildMarketGraph<LOAD_VALUES>(file_in, di, graph, false, false) != 0)
             return 1;
-    }
-    else
-    {
+    } else {
         fprintf(stderr, "Unspecified Graph Type.\n");
     }
     return 0;
diff --git a/gunrock/gunrock.h b/gunrock/gunrock.h
index ee695951f..24d8d421f 100644
--- a/gunrock/gunrock.h
+++ b/gunrock/gunrock.h
@@ -12,7 +12,6 @@
  * The Gunrock public interface is a C-only interface to enable linking
  * with code written in other languages. While the internals of Gunrock
  * are not limited to C.
- *
  */
 
 #include <stdlib.h>
@@ -21,129 +20,191 @@
 /**
  * @brief VertexId data type enumerators.
  */
-enum VertexIdType {
-    VTXID_INT, //!< integer type
+enum VtxIdType {
+    VTXID_INT,  // integer type
 };
 
 /**
  * @brief SizeT data type enumerators.
  */
 enum SizeTType {
-    SIZET_INT, //!< unsigned integer type
+    SIZET_INT,  // unsigned integer type
 };
 
 /**
  * @brief Value data type enumerators.
  */
 enum ValueType {
-    VALUE_INT,   //!< integer type
-    VALUE_UINT,  //!< unsigned int type
-    VALUE_FLOAT, //!< float type
+    VALUE_INT,    // integer type
+    VALUE_UINT,   // unsigned int type
+    VALUE_FLOAT,  // float type
 };
 
 /**
  * @brief data-type configuration used to specify data types
  */
-struct GunrockDataType {
-    enum VertexIdType VTXID_TYPE; //!< VertexId data-type
-    enum SizeTType    SIZET_TYPE; //!< SizeT    data-type
-    enum ValueType    VALUE_TYPE; //!< Value    data-type
+struct GRTypes {
+    enum VtxIdType VTXID_TYPE;  // VertexId data type
+    enum SizeTType SIZET_TYPE;  // SizeT data type
+    enum ValueType VALUE_TYPE;  // Value data type
 };
 
 /**
  * @brief GunrockGraph as a standard graph interface
  */
-struct GunrockGraph {
-    size_t num_nodes;    //!< number of nodes in graph
-    size_t num_edges;    //!< number of edges in graph
-    void   *row_offsets; //!< C.S.R. row offsets
-    void   *col_indices; //!< C.S.R. column indices
-    void   *col_offsets; //!< C.S.C. column offsets
-    void   *row_indices; //!< C.S.C. row indices
-    void   *node_values; //!< associated values per node
-    void   *edge_values; //!< associated values per edge
+struct GRGraph {
+    size_t  num_nodes;  // number of nodes in graph
+    size_t  num_edges;  // number of edges in graph
+    void *row_offsets;  // CSR row offsets
+    void *col_indices;  // CSR column indices
+    void *col_offsets;  // CSC column offsets
+    void *row_indices;  // CSC row indices
+    void *node_values;  // associated values per node
+    void *edge_values;  // associated values per edge
 };
 
 /**
  * @brief Source Vertex Mode enumerators.
  */
 enum SrcMode {
-    manually,       //!< manually set up source node
-    randomize,      //!< random generate source node
-    largest_degree, //!< set to largest-degree node
+    manually,        // manually set up source node
+    randomize,       // random generate source node
+    largest_degree,  // set to largest-degree node
 };
 
 /**
  * @brief arguments configuration used to specify arguments
  */
-struct GunrockConfig {
-    bool  mark_pred;        //!< whether to mark predecessor or not
-    bool  idempotence;      //!< whether or not to enable idempotent
-    int   src_node;         //!< source vertex define where to start
-    int   device;           //!< setting which gpu device to use
-    int   max_iter;         //!< maximum number of iterations allowed
-    int   top_nodes;        //!< k value for topk / page_rank problem
-    int   delta_factor;     //!< sssp delta-factor parameter
-    float delta;            //!< pagerank specific value
-    float error;            //!< pagerank specific value
-    float queue_size;       //!< setting frontier queue size
-    enum  SrcMode src_mode; //!< source mode rand/largest_degree
+struct GRSetup {
+    bool        mark_pred;  // whether to mark predecessor or not
+    bool      idempotence;  // whether or not to enable idempotent
+    int          src_node;  // source vertex define where to start
+    int            device;  // setting which device to use
+    int          max_iter;  // maximum number of iterations allowed
+    int         top_nodes;  // k value for top k / pagerank problem
+    int      delta_factor;  // sssp delta-factor parameter
+    float           delta;  // pagerank specific value
+    float           error;  // pagerank specific value
+    float      queue_size;  // setting frontier queue size
+    enum SrcMode src_mode;  // source mode rand/largest_degree
 };
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-// BFS Function Define
-void gunrock_bfs_func(
-    struct GunrockGraph       *graph_out,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// BC Function Define
-void gunrock_bc_func(
-    struct GunrockGraph       *graph_out,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// CC Function Define
-void gunrock_cc_func(
-    struct GunrockGraph       *graph_out,
-    unsigned int              *components,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// SSSP Function Define
-void gunrock_sssp_func(
-    struct GunrockGraph       *graph_out,
-    void                      *predecessor,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      congis,
-    struct GunrockDataType    data_type);
-
-// PR Function Define
-void gunrock_pr_func(
-    struct GunrockGraph       *graph_out,
-    void                      *node_ids,
-    void                      *page_rank,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// TopK Function Define
-void gunrock_topk_func(
-    struct GunrockGraph       *graph_out,
-    void                      *node_ids,
-    void                      *in_degrees,
-    void                      *out_degrees,
-    const struct GunrockGraph *graph_in,
-    struct GunrockConfig      configs,
-    struct GunrockDataType    data_type);
-
-// TODO: Add other algorithms
+/**
+ * breadth-first search
+ */
+void gunrock_bfs(
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+void bfs(
+    int*       bfs_label,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source);
+
+/**
+ * betweenness centrality
+ */
+void gunrock_bc(
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+void bc(
+    float*     bc_scores,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices,
+    const int  source);
+
+/**
+ * connected component
+ */
+void gunrock_cc(
+    struct GRGraph*       graph_o,
+    unsigned int*         components,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+int cc(
+    int*       component,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
+/**
+ * single-source shortest path
+ */
+void gunrock_sssp(
+    struct GRGraph*       graph_o,
+    void*                 predecessor,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+void sssp(
+    unsigned int*       distances,
+    const int           num_nodes,
+    const int           num_edges,
+    const int*          row_offsets,
+    const int*          col_indices,
+    const unsigned int* edge_values,
+    const int           source);
+
+// pagerank
+void gunrock_pagerank(
+    struct GRGraph*       graph_o,
+    void*                 node_ids,
+    void*                 pagerank,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+void pagerank(
+    int*       node_ids,
+    float*     pagerank,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
+// degree centrality
+void gunrock_topk(
+    struct  GRGraph*      graph_o,
+    void*                 node_ids,
+    void*                 in_degrees,
+    void*                 out_degrees,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+// minimum spanning tree
+void gunrock_mst(
+    struct GRGraph*       graph_o,
+    const struct GRGraph* graph_i,
+    const struct GRSetup  config,
+    const struct GRTypes  data_t);
+
+void mst(
+    bool*      edge_mask,
+    const int  num_nodes,
+    const int  num_edges,
+    const int* row_offsets,
+    const int* col_indices);
+
+// TODO(ydwu): Add other primitives
 
 #ifdef __cplusplus
 }
diff --git a/gunrock/oprtr/edge_map_partitioned/kernel.cuh b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
index b9634ba8b..ba5295a81 100644
--- a/gunrock/oprtr/edge_map_partitioned/kernel.cuh
+++ b/gunrock/oprtr/edge_map_partitioned/kernel.cuh
@@ -564,7 +564,7 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
         }
 
         // Determine work decomposition
-        if (blockIdx.x == 0 && threadIdx.x == 0) {
+        if (blockIdx.x == 0 && threadIdx.x == 0) { 
 
             // obtain problem size
             if (queue_reset)
@@ -586,10 +586,10 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
             // Reset our next outgoing queue counter to zero
             work_progress.template StoreQueueLength<SizeT>(0, queue_index + 2);
             work_progress.template PrepResetSteal<SizeT>(queue_index + 1);
-        }
+        } 
 
         // Barrier to protect work decomposition
-        __syncthreads();
+        __syncthreads(); 
 
         unsigned int range = input_queue_len;
         int tid = threadIdx.x;
@@ -618,16 +618,16 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
             else
                 s_vertices[tid] = (my_id < range ? d_column_indices[d_queue[my_id]] : max_vertices);
             s_edge_ids[tid] = (my_id < range ? d_queue[my_id] : max_vertices);
-        }
+        } 
 
         __syncthreads();
-        unsigned int size = s_edges[end_id];
+        unsigned int size = s_edges[end_id]; 
 
         VertexId v, e, e_id;
         int v_index = BinarySearch<KernelPolicy::THREADS>(tid, s_edges);
         v = s_vertices[v_index];
         e_id = s_edge_ids[v_index];
-        int end_last = (v_index < KernelPolicy::THREADS ? s_edges[v_index] : max_vertices);
+        int end_last = (v_index < KernelPolicy::THREADS ? s_edges[v_index] : max_vertices); 
 
         for (int i = tid; i < size; i += KernelPolicy::THREADS)
         {
@@ -726,7 +726,7 @@ struct Dispatch<KernelPolicy, ProblemData, Functor, true>
                         }
                     }
                 }
-            } else {
+            } else { 
                 //v:pre, u:neighbor, outoffset:offset+i
                 if (Functor::CondEdge(v, u, problem, lookup, e_id)) {
                     Functor::ApplyEdge(v, u, problem, lookup, e_id);
diff --git a/gunrock/util/select_utils.cuh b/gunrock/util/select_utils.cuh
index 2db66ca14..1da71e8fa 100644
--- a/gunrock/util/select_utils.cuh
+++ b/gunrock/util/select_utils.cuh
@@ -18,136 +18,104 @@
 namespace gunrock {
 namespace util {
 
-    /**
-     * \addtogroup PublicInterface
-     * @{
-     */
-
-    //---------------------------------------------------------------------
-    // Globals, constants and typedefs
-    //---------------------------------------------------------------------
-    struct GreaterThan
-    {
-	int compare;
-
-	__host__ __device__ __forceinline__
-	GreaterThan(int compare) : compare(compare) { }
-
-	__host__ __device__ __forceinline__
-	bool operator()(const int &a) const { return (a > compare); }
-    };
-
-    /**
-     * @brief selects items from from a sequence of int keys using a
-     * section functor (greater-than)
-     *
-     */
-    template <typename VertexId, typename SizeT>
-    cudaError_t CUBSelect(
-	VertexId  *d_input,
-	SizeT     num_elements,
-	VertexId  *d_output,
-	unsigned int *num_selected)
-    {
-	cudaError_t retval = cudaSuccess;
-
-	/*
-	  VertexId *input  = NULL;
-	  VertexId *output = NULL;
-
-	  if (util::GRError((retval = cudaMalloc(
-	  &input, sizeof(VertexId)*d_num_elements)),
-	  "CUBSelect input malloc failed",
-	  __FILE__, __LINE__)) return retval;
-	  if (util::GRError((retval = cudaMalloc(
-	  &output, sizeof(VertexId)*d_num_elements)),
-	  "CUBSelect output malloc failed",
-	  __FILE__, __LINE__)) return retval;
-
-	  cub::DoubleBuffer<VertexId> d_input_buffer(d_input, input);
-	  cub::DoubleBuffer<VertexId> d_output_buffer(d_output, output);
-	*/
-
-	unsigned int *d_num_selected = NULL;
-	if (util::GRError((retval = cudaMalloc(
-	    (void**)&d_num_selected, sizeof(unsigned int))),
-	    "CUBSelect d_num_selected malloc failed",
-	    __FILE__, __LINE__)) return retval;
-
-	void  *d_temp_storage = NULL;
-	size_t temp_storage_bytes = 0;
-	GreaterThan select_op(-1);
-
-	// determine temporary device storage requirements
-	if (util::GRError((retval = cub::DeviceSelect::If(
-	    d_temp_storage,
-	    temp_storage_bytes,
-	    d_input,
-	    d_output,
-	    d_num_selected,
-	    num_elements,
-	    select_op)),
-	    "CUBSelect cub::DeviceSelect::If failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// allocate temporary storage
-	if (util::GRError((retval = cudaMalloc(
-	    &d_temp_storage, temp_storage_bytes)),
-	    "CUBSelect malloc d_temp_storage failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// run selection
-	if (util::GRError((retval = cub::DeviceSelect::If(
-	    d_temp_storage,
-	    temp_storage_bytes,
-	    d_input,
-	    d_output,
-	    d_num_selected,
-	    num_elements,
-	    select_op)),
+/**
+ * \addtogroup PublicInterface
+ * @{
+ */
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+struct GreaterThan
+{
+    int compare;
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int compare) : compare(compare) { }
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &a) const { return (a > compare); }
+};
+
+/**
+ * @brief selects items from a sequence of int keys using a
+ * selection functor (greater-than)
+ *
+ */
+template <typename T, typename SizeT>
+cudaError_t CUBSelect(
+    T            *d_input,
+    SizeT         num_elements,
+    T     *d_output,
+    unsigned int *num_selected)
+{
+    cudaError_t retval = cudaSuccess;
+    unsigned int *d_num_selected = NULL;
+
+    if (util::GRError(
+            (retval = cudaMalloc((void**)&d_num_selected, sizeof(unsigned int))),
+            "CUBSelect d_num_selected malloc failed",
+            __FILE__, __LINE__)) return retval;
+
+    void *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+    GreaterThan select_op(-1);
+
+    // determine temporary device storage requirements
+    if (util::GRError(
+            (retval = cub::DeviceSelect::If(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_input,
+                d_output,
+                d_num_selected,
+                num_elements,
+                select_op)),
             "CUBSelect cub::DeviceSelect::If failed",
             __FILE__, __LINE__)) return retval;
 
-	/*
-	// copy back output
-	if (util::GRError((retval = cudaMemcpy(
-	d_output,
-	d_output_buffer.Current(),
-	sizeof(VertexId)*(*d_num_selected),
-	cudaMemcpyDeviceToDevice)),
-	"CUBSelect copy back output failed",
-	__FILE__, __LINE__)) return retval;
-	*/
-
-	if (util::GRError((retval = cudaMemcpy(
-	    num_selected,
-	    d_num_selected,
-	    sizeof(unsigned int),
-	    cudaMemcpyDeviceToHost)),
-	    "CUBSelect copy back num_selected failed",
-	    __FILE__, __LINE__)) return retval;
-
-	// clean up
-	if (util::GRError((retval = cudaFree(d_temp_storage)),
-	    "CUBSelect free d_temp_storage failed",
-	    __FILE__, __LINE__)) return retval;
-	if (util::GRError((retval = cudaFree(d_num_selected)),
-            "CUBSelect free d_num_selected failed",
-	    __FILE__, __LINE__)) return retval;
+    // allocate temporary storage
+    if (util::GRError(
+            (retval = cudaMalloc(&d_temp_storage, temp_storage_bytes)),
+            "CUBSelect malloc d_temp_storage failed",
+            __FILE__, __LINE__)) return retval;
+
+    // run selection
+    if (util::GRError(
+            (retval = cub::DeviceSelect::If(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_input,
+                d_output,
+                d_num_selected,
+                num_elements,
+                select_op)),
+            "CUBSelect cub::DeviceSelect::If failed",
+            __FILE__, __LINE__)) return retval;
 
-	/*
-	  if (util::GRError((retval = cudaFree(input)),
-	  "CUBSelect free input failed",
-	  __FILE__, __LINE__)) return retval;
-	  if (util::GRError((retval = cudaFree(output)),
-	  "CUBSelect free output failed",
-	  __FILE__, __LINE__)) return retval;
-	*/
+    if (util::GRError(
+            (retval = cudaMemcpy(
+                num_selected,
+                d_num_selected,
+                sizeof(unsigned int),
+                cudaMemcpyDeviceToHost)),
+            "CUBSelect copy back num_selected failed",
+            __FILE__, __LINE__)) return retval;
+
+    // clean up
+    if (util::GRError(
+            (retval = cudaFree(d_temp_storage)),
+            "CUBSelect free d_temp_storage failed",
+            __FILE__, __LINE__)) return retval;
+    if (util::GRError(
+            (retval = cudaFree(d_num_selected)),
+            "CUBSelect free d_num_selected failed",
+            __FILE__, __LINE__)) return retval;
 
-	return retval;
-    }
+    return retval;
+}
 
-    /** @} */
+/** @} */
 
 } //util
 } //gunrock
diff --git a/gunrock/util/test_utils.cuh b/gunrock/util/test_utils.cuh
index 8e62f63cd..491b2136e 100644
--- a/gunrock/util/test_utils.cuh
+++ b/gunrock/util/test_utils.cuh
@@ -251,6 +251,44 @@ void DisplayDeviceResults(
     if (h_data) free(h_data);
 }
 
+/**
+ * Copy a device data array and a device index array back to the host and
+ * print one "index:value" pair for each entry of the index array.
+ */
+template <typename DATATYPE, typename INDEXTYPE>
+void DisplayDeviceResults(
+    DATATYPE *d_data,
+    INDEXTYPE *d_indices,
+    size_t num_elements,
+    size_t num_indices)
+{
+    printf("num_elements:%lu\n", (unsigned long) num_elements);
+    printf("num_indices:%lu\n", (unsigned long) num_indices);
+    // Allocate host mirrors of both device arrays
+    DATATYPE *h_data = (DATATYPE*) malloc(num_elements * sizeof(DATATYPE));
+    INDEXTYPE *h_indices = (INDEXTYPE*) malloc(num_indices * sizeof(INDEXTYPE));
+
+    // Copy data and indices back to the host
+    cudaMemcpy(h_data, d_data, sizeof(DATATYPE) * num_elements, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_indices, d_indices, sizeof(INDEXTYPE) * num_indices, cudaMemcpyDeviceToHost);
+
+    // Display data, gathered through the index array
+    printf("\n\nData:\n");
+    for (size_t i = 0; i < num_indices; i++)
+    {
+        PrintValue(h_indices[i]);
+        printf(":");
+        assert(h_indices[i] < num_elements);  // index must stay inside h_data
+        PrintValue(h_data[h_indices[i]]);
+        printf(", ");
+    }
+    printf("\n\n");
+
+    // Cleanup
+    if (h_data) free(h_data);
+    if (h_indices) free(h_indices);
+}
+
 /******************************************************************************
  * Timing
  ******************************************************************************/
@@ -423,7 +461,8 @@ int CompareResults(
                 is_right = false;
             }
         }
-        if (!is_right && flag == 0)
+        
+        if (!is_right)
         {
             printf("\nINCORRECT: [%lu]: ", (unsigned long) i);
             PrintValue<float>(computed[i]);
@@ -448,7 +487,6 @@ int CompareResults(
                 printf("...]");
             }
             flag += 1;
-            //return flag;
         }
         if (!is_right && flag > 0) flag += 1;
     }
diff --git a/gunrock/util/test_utils.h b/gunrock/util/test_utils.h
index 41c27c04b..f3625bdfe 100644
--- a/gunrock/util/test_utils.h
+++ b/gunrock/util/test_utils.h
@@ -19,16 +19,17 @@
     #undef small            // Windows is terrible for polluting macro namespace
 #else
     #include <sys/resource.h>
-    #include <time.h>
 #endif
 
 #include <stdio.h>
 #include <math.h>
 #include <float.h>
 
+#include <cassert>
 #include <map>
 #include <string>
 #include <vector>
+#include <stack>
 #include <sstream>
 #include <iostream>
 #include <fstream>
@@ -217,59 +218,8 @@ struct CpuTimer
         return (stop - start) * 1000;
     }
 
-/*#elif defined(CLOCK_PROCESS_CPUTIME_ID)
-
-    timespec start;
-    timespec stop;
-
-    void Start()
-    {
-        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
-    }
-
-    void Stop()
-    {
-        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
-    }
-
-    float ElapsedMillis()
-    {
-        timespec temp;
-        if ((stop.tv_nsec-start.tv_nsec)<0) {
-            temp.tv_sec = stop.tv_sec-start.tv_sec-1;
-            temp.tv_nsec = 1000000000+stop.tv_nsec-start.tv_nsec;
-        } else {
-            temp.tv_sec = stop.tv_sec-start.tv_sec;
-            temp.tv_nsec = stop.tv_nsec-start.tv_nsec;
-        }
-        return temp.tv_nsec/1000000.0;
-    }*/
-
 #else
 
-    /*
-    rusage start;
-    rusage stop;
-
-    void Start()
-    {
-        getrusage(RUSAGE_SELF, &start);
-    }
-
-    void Stop()
-    {
-        getrusage(RUSAGE_SELF, &stop);
-    }
-
-    float ElapsedMillis()
-    {
-        float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
-        float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
-
-        return (sec * 1000) + (usec / 1000);
-    }
-    */
-
     boost::timer::cpu_timer::cpu_timer cpu_t;
 
     void Start()
diff --git a/python/betweenness_centrality.py b/python/betweenness_centrality.py
new file mode 100644
index 000000000..e7978d04f
--- /dev/null
+++ b/python/betweenness_centrality.py
@@ -0,0 +1,26 @@
+### sample python interface - betweenness centrality
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+scores = pointer((c_float * nodes)())
+
+### call gunrock function on device
+gunrock.bc(scores, nodes, edges, row, col, -1)
+
+### sample results
+print ' node bc scores:',
+for idx in range(nodes): print scores[0][idx],
diff --git a/python/breath_first_search.py b/python/breath_first_search.py
new file mode 100644
index 000000000..b67fe80c0
--- /dev/null
+++ b/python/breath_first_search.py
@@ -0,0 +1,26 @@
+### sample python interface - breadth-first search
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+labels = pointer((c_int * nodes)())
+
+### call gunrock function on device
+gunrock.bfs(labels, nodes, edges, row, col, 0)
+
+### sample results
+print ' bfs labels (depth):',
+for idx in range(nodes): print labels[0][idx],
diff --git a/python/connected_components.py b/python/connected_components.py
new file mode 100644
index 000000000..89fd824cb
--- /dev/null
+++ b/python/connected_components.py
@@ -0,0 +1,27 @@
+### sample python interface - connected components
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+labels = pointer((c_int * nodes)())
+
+### call gunrock function on device
+num_components = gunrock.cc(labels, nodes, edges, row, col)
+
+### sample results
+print ' number of components: ' + str(num_components)
+print ' component ids:',
+for idx in range(nodes): print labels[0][idx],
diff --git a/python/pagerank.py b/python/pagerank.py
new file mode 100644
index 000000000..642fa2e12
--- /dev/null
+++ b/python/pagerank.py
@@ -0,0 +1,29 @@
+### sample python interface - pagerank
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int * len(row_list))(*row_list))
+col = pointer((c_int * len(col_list))(*col_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+node = pointer((c_int * nodes)())
+rank = pointer((c_float * nodes)())
+
+### call gunrock function on device
+gunrock.pagerank(node, rank, nodes, edges, row, col)
+
+### sample results
+print 'top page rank:'
+for idx in range(nodes):
+    print node[0][idx],
+    print rank[0][idx]
diff --git a/python/single_source_shortest_path.py b/python/single_source_shortest_path.py
new file mode 100644
index 000000000..69edc2b39
--- /dev/null
+++ b/python/single_source_shortest_path.py
@@ -0,0 +1,28 @@
+### sample python interface - single-source shortest path
+
+from ctypes import *
+
+### load gunrock shared library - libgunrock
+gunrock = cdll.LoadLibrary('../../build/lib/libgunrock.so')
+
+### read in input CSR arrays from files
+row_list = [int(x.strip()) for x in open('toy_graph/row.txt')]
+col_list = [int(x.strip()) for x in open('toy_graph/col.txt')]
+val_list = [int(x.strip()) for x in open('toy_graph/val.txt')]
+
+### convert CSR graph inputs for gunrock input
+row = pointer((c_int  * len(row_list))(*row_list))
+col = pointer((c_int  * len(col_list))(*col_list))
+val = pointer((c_uint * len(val_list))(*val_list))
+nodes = len(row_list) - 1
+edges = len(col_list)
+
+### output array
+labels = pointer((c_uint * nodes)())
+
+### call gunrock function on device
+gunrock.sssp(labels, nodes, edges, row, col, val, 0)
+
+### sample results
+print ' sssp labels (distance):',
+for idx in range(nodes): print labels[0][idx],
diff --git a/python/toy_graph/col.txt b/python/toy_graph/col.txt
new file mode 100644
index 000000000..12c10b45e
--- /dev/null
+++ b/python/toy_graph/col.txt
@@ -0,0 +1,26 @@
+1
+2
+3
+0
+2
+4
+0
+1
+3
+4
+5
+0
+2
+5
+6
+1
+2
+5
+6
+2
+3
+4
+6
+3
+4
+5
diff --git a/python/toy_graph/row.txt b/python/toy_graph/row.txt
new file mode 100644
index 000000000..1a84c1d97
--- /dev/null
+++ b/python/toy_graph/row.txt
@@ -0,0 +1,8 @@
+0
+3
+6
+11
+15
+19
+23
+26
diff --git a/python/toy_graph/val.txt b/python/toy_graph/val.txt
new file mode 100644
index 000000000..15282b913
--- /dev/null
+++ b/python/toy_graph/val.txt
@@ -0,0 +1,26 @@
+3
+4
+5
+3
+5
+7
+4
+5
+7
+8
+9
+5
+7
+10
+11
+7
+8
+11
+12
+9
+10
+11
+13
+11
+12
+13
diff --git a/shared_lib_tests/CMakeLists.txt b/shared_lib_tests/CMakeLists.txt
index 3d3f638db..7d880fae6 100644
--- a/shared_lib_tests/CMakeLists.txt
+++ b/shared_lib_tests/CMakeLists.txt
@@ -1,20 +1,26 @@
 # gunrock test rig cmake file
 # include_directories(${gunrock_INCLUDE_DIRS}/gunrock)
 
-add_executable (test_topk test_topk.c)
+add_executable(simple_interface_test simple_interface_test.c)
+target_link_libraries(simple_interface_test gunrock)
+
+add_executable(test_topk test_topk.c)
 target_link_libraries(test_topk gunrock)
 
-add_executable (test_bfs test_bfs.c)
+add_executable(test_bfs test_bfs.c)
 target_link_libraries(test_bfs gunrock)
 
-add_executable (test_bc test_bc.c)
+add_executable(test_bc test_bc.c)
 target_link_libraries(test_bc gunrock)
 
-add_executable (test_cc test_cc.c)
+add_executable(test_cc test_cc.c)
 target_link_libraries(test_cc gunrock)
 
-add_executable (test_sssp test_sssp.c)
+add_executable(test_sssp test_sssp.c)
 target_link_libraries(test_sssp gunrock)
 
-add_executable (test_pr test_pr.c)
-target_link_libraries(test_pr gunrock)
\ No newline at end of file
+add_executable(test_pr test_pr.c)
+target_link_libraries(test_pr gunrock)
+
+add_executable(test_mst test_mst.c)
+target_link_libraries(test_mst gunrock)
\ No newline at end of file
diff --git a/shared_lib_tests/simple_interface_test.c b/shared_lib_tests/simple_interface_test.c
new file mode 100644
index 000000000..66cd14c70
--- /dev/null
+++ b/shared_lib_tests/simple_interface_test.c
@@ -0,0 +1,83 @@
+/**
+ * @brief Simple test for shared library simple interface
+ * @file simple_interface_test.c
+ */
+
+#include <stdio.h>
+#include <gunrock/gunrock.h>
+
+int main(int argc, char* argv[]) {
+    // Runs each simple-interface primitive on one small CSR graph and prints the results.
+    ///////////////////////////////////////////////////////////////////////////
+    // define input graph in CSR form (7 nodes, 26 edge entries)
+    int row_offsets[] = {
+        0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[] = {
+        1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+        5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    unsigned int edge_values[] = {
+        3, 4, 5, 3, 5, 7, 4, 5, 7, 8, 9, 5, 7, 10,
+        11, 7, 8, 11, 12, 9, 10, 11, 13, 11, 12, 13};
+
+    // nodes = length of row offsets-1, edges = length of column indices
+    size_t num_nodes = sizeof(row_offsets) / sizeof(row_offsets[0]) - 1;
+    size_t num_edges = sizeof(col_indices) / sizeof(col_indices[0]);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // allocate host arrays to store test results (one entry per node)
+    int*   bfs_label = (  int*)malloc(sizeof(  int) * num_nodes);
+    float* bc_scores = (float*)malloc(sizeof(float) * num_nodes);
+    int*   conn_comp = (  int*)malloc(sizeof(  int) * num_nodes);
+    unsigned int *sssp_dist =
+        (unsigned int*)malloc(sizeof( unsigned int) * num_nodes);
+    int*    pr_nodes = (  int*)malloc(sizeof(  int) * num_nodes);
+    float*  pr_ranks = (float*)malloc(sizeof(float) * num_nodes);
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing breath-first search ...\n");
+    bfs(bfs_label, num_nodes, num_edges, row_offsets, col_indices, 0);
+    int node; for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | label (depth): [%d]\n", node, bfs_label[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing betweenness centrality ...\n");
+    bc(bc_scores, num_nodes, num_edges, row_offsets, col_indices, -1);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | score: [%.4f]\n", node, bc_scores[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing connected components ...\n");
+    int num_comp = cc(conn_comp, num_nodes, num_edges, row_offsets, col_indices);
+    printf(" total number of components: %d\n", num_comp);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | component: [%d]\n", node, conn_comp[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing single-source shortest path ...\n");
+    sssp(sssp_dist, num_nodes, num_edges, row_offsets, col_indices, edge_values, 0);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | distance: [%u]\n", node, sssp_dist[node]);
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    printf("\n testing pagerank ...\n");
+    pagerank(pr_nodes, pr_ranks, num_nodes, num_edges, row_offsets, col_indices);
+    for (node = 0; node < num_nodes; ++node) {
+        printf(" node: [%d] | rank: [%.4f]\n", pr_nodes[node], pr_ranks[node]);
+    }
+
+    // TODO(ydwu): add other primitive tests
+
+    // clean ups
+    if (bfs_label) free(bfs_label);
+    if (bc_scores) free(bc_scores);
+    if (conn_comp) free(conn_comp);
+    if (sssp_dist) free(sssp_dist);
+    if (pr_nodes)   free(pr_nodes);
+    if (pr_ranks)   free(pr_ranks);
+
+    return 0;
+}
diff --git a/shared_lib_tests/test_bc.c b/shared_lib_tests/test_bc.c
index 0eb4fdf0f..177585a58 100644
--- a/shared_lib_tests/test_bc.c
+++ b/shared_lib_tests/test_bc.c
@@ -1,76 +1,65 @@
 /**
  * @brief BC test for shared library
  * @file test_bc.c
- *
- * set input graph, configs and call function gunrock_bc_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_FLOAT;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_FLOAT;
 
-  // bc configurations (optional)
-  struct GunrockConfig bc_config;
-  bc_config.device       =    0;
-  bc_config.src_node     =   -1;     //!< source vertex to begin search
-  bc_config.queue_size   = 1.0f;
-  bc_config.src_mode = manually;
+    // bc configurations (optional)
+    struct GRSetup config;
+    config.device     =    0;
+    config.src_node   =   -1;  // source vertex to begin search
+    config.queue_size = 1.0f;
+    config.src_mode   = manually;
 
-  // define graph (undirected graph)
-  size_t num_nodes = 7;
-  size_t num_edges = 26;
-  int row_offsets[8] = {0, 3, 6, 11, 15, 19, 23, 26};
-  int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
-                         5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    // define graph (undirected graph)
+    size_t num_nodes    = 7;
+    size_t num_edges    = 26;
+    int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                           5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
 
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
 
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
 
-  // run bc calculations
-  gunrock_bc_func(
-    graph_output,
-    graph_input,
-    bc_config,
-    data_type);
+    // run bc calculations
+    gunrock_bc(graph_o, graph_i, config, data_t);
 
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  // print per node betweeness centrality values
-  float *bc_vals = (float*)malloc(sizeof(float) * graph_input->num_nodes);
-  bc_vals = (float*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]);
-  }
-  printf("\n");
-  // print per edge betweeness centrality values
-  float *ebc_vals = (float*)malloc(sizeof(float)*graph_input->num_edges);
-  ebc_vals = (float*)graph_output->edge_values;
-  for (i = 0; i < graph_input->num_edges; ++i)
-  {
-    printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]);
-  }
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    // print per node betweenness centrality values
+    // (borrow graph_o's buffer; a separate malloc here would only be leaked)
+    float *bc_vals = (float*)graph_o->node_values;
+    for (i = 0; i < graph_i->num_nodes; ++i) {
+        printf("Node_ID [%d] : BC[%f]\n", i, bc_vals[i]);
+    }
+    printf("\n");
+    // print per edge betweenness centrality values
+    // (borrow graph_o's buffer; a separate malloc here would only be leaked)
+    float *ebc_vals = (float*)graph_o->edge_values;
+    for (i = 0; i < graph_i->num_edges; ++i) {
+        printf("Edge_ID [%d] : EBC[%f]\n", i, ebc_vals[i]);
+    }
 
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
 
-  return 0;
+    return 0;
 }
diff --git a/shared_lib_tests/test_bfs.c b/shared_lib_tests/test_bfs.c
index d3f57b747..11b43b2a5 100644
--- a/shared_lib_tests/test_bfs.c
+++ b/shared_lib_tests/test_bfs.c
@@ -1,69 +1,59 @@
 /**
  * @brief BFS test for shared library
  * @file test_bfs.c
- *
- * set input graph, configs and call function gunrock_bfs_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
-
-  // bfs configurations (optional)
-  struct GunrockConfig bfs_config;
-  bfs_config.device      = 0;
-  bfs_config.src_mode    = randomize;
-  bfs_config.src_node    = 1;     //!< source vertex to begin search
-  bfs_config.mark_pred   = false; //!< do not mark predecessors
-  bfs_config.idempotence = false; //!< wether enable idempotence
-  bfs_config.queue_size  = 1.0f;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-
-  // run bfs calculations
-  gunrock_bfs_func(
-    graph_output,
-    graph_input,
-    bfs_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  int *labels = (int*)malloc(sizeof(int) * graph_input->num_nodes);
-  labels = (int*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : Label [%d]\n", i, labels[i]);
-  }
-
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-  if (labels)       { free(labels);       }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    // bfs configurations (optional)
+    struct GRSetup config;
+    config.device      = 0;
+    config.src_mode    = randomize;
+    config.src_node    = 1;      // source vertex to begin search
+    config.mark_pred   = false;  // do not mark predecessors
+    config.idempotence = false;  // whether to enable idempotence
+    config.queue_size  = 1.0f;
+
+    // define graph
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    // run bfs calculations
+    gunrock_bfs(graph_o, graph_i, config, data_t);
+
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    // borrow the result buffer; a separate malloc here would only be leaked
+    int *labels = (int*)graph_o->node_values;
+    for (i = 0; i < graph_i->num_nodes; ++i) {
+        printf("Node_ID [%d] : Label [%d]\n", i, labels[i]);
+    }
+
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+    if (labels)  { free(labels);  }  // labels aliases graph_o->node_values
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_cc.c b/shared_lib_tests/test_cc.c
index a230619b9..0dbd67bc1 100644
--- a/shared_lib_tests/test_cc.c
+++ b/shared_lib_tests/test_cc.c
@@ -1,66 +1,55 @@
 /**
  * @brief CC test for shared library
  * @file test_cc.c
- *
- * set input graph, configs and call function gunrock_cc_func
- * return per node label values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
-
-  // connected component configurations
-  struct GunrockConfig configs;
-  configs.device = 0;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
-
-  // run connected component calculations
-  gunrock_cc_func(
-    graph_output,
-    components,
-    graph_input,
-    configs,
-    data_type);
-
-  // test print
-  int i;
-  printf("Number of Components: %d\n", components[0]);
-  printf("Demo Outputs:\n");
-  int *component_ids = (int*)malloc(sizeof(int) * graph_input->num_nodes);
-  component_ids = (int*)graph_output->node_values;
-  for (i = 0; i < graph_input->num_nodes; ++i)
-  {
-    printf("Node_ID [%d] : Component_ID [%d]\n", i, component_ids[i]);
-  }
-
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    // connected component configurations
+    struct GRSetup config;
+    config.device = 0;
+
+    // define graph
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    unsigned int *components = (unsigned int*)malloc(sizeof(unsigned int));
+
+    // run connected component calculations
+    gunrock_cc(graph_o, components, graph_i, config, data_t);
+
+    // demo test print (components is unsigned, so print it with %u)
+    printf("Number of Components: %u\n", components[0]);
+    printf("Demo Outputs:\n");
+    int *component_ids = (int*)graph_o->node_values;  // borrowed, no malloc
+    int node;
+    for (node = 0; node < graph_i->num_nodes; ++node) {
+        printf("Node_ID [%d] : Component_ID [%d]\n", node, component_ids[node]);
+    }
+
+    // clean up
+    if (components) { free(components); }
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_mst.c b/shared_lib_tests/test_mst.c
new file mode 100644
index 000000000..07fbdb11c
--- /dev/null
+++ b/shared_lib_tests/test_mst.c
@@ -0,0 +1,57 @@
+/**
+ * @brief MST test for shared library
+ * @file test_mst.c
+ */
+
+#include <stdio.h>
+#include <gunrock/gunrock.h>
+
+int main(int argc, char* argv[]) {
+    // set problem data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+
+    // configurations (optional)
+    struct GRSetup config;
+    config.device = 0;
+
+    // tiny sample graph
+    size_t num_nodes = 7;
+    size_t num_edges = 26;
+    int row_offsets[8]  = {0, 3, 6, 11, 15, 19, 23, 26};
+    int col_indices[26] = {1, 2, 3, 0, 2, 4, 0, 1, 3, 4, 5, 0, 2,
+                           5, 6, 1, 2, 5, 6, 2, 3, 4, 6, 3, 4, 5};
+    int edge_values[26] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    // build a graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+
+    // call minimum spanning tree
+    gunrock_mst(graph_o, graph_i, config, data_t);
+
+    // demo test print
+    printf("Demo Outputs:\n");
+    // borrow the per-edge result buffer; a separate malloc would be leaked
+    int *mst_mask = (int*)graph_o->edge_values;
+    int edge;
+    for (edge = 0; edge < num_edges; ++edge) {
+        printf("Edge ID [%d] : Mask [%d]\n", edge, mst_mask[edge]);
+    }
+
+    // clean up
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
+}
diff --git a/shared_lib_tests/test_pr.c b/shared_lib_tests/test_pr.c
index 0b8ceae0c..cb36b4df1 100644
--- a/shared_lib_tests/test_pr.c
+++ b/shared_lib_tests/test_pr.c
@@ -1,74 +1,62 @@
 /**
  * @brief PR test for shared library
  * @file test_pr.c
- *
- * set input graph, configs and call function gunrock_pr_func
- * return per node or per edge values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;   //!< integer type vertex_ids
-  data_type.SIZET_TYPE = SIZET_INT;   //!< integer type graph size
-  data_type.VALUE_TYPE = VALUE_FLOAT; //!< float type value for pr
-
-  // pr configurations (optional)
-  struct GunrockConfig pr_config;
-  pr_config.device    =     0; //!< use device 0
-  pr_config.delta     = 0.85f; //!< default delta value
-  pr_config.error     = 0.01f; //!< default error threshold
-  pr_config.max_iter  =    20; //!< maximum number of iterations
-  pr_config.top_nodes =    10; //!< number of top nodes
-  pr_config.src_node  =     0; //!< source node to begin page rank
-  pr_config.src_mode  = manually; //!< set source node manually
-
-  // define graph (undirected graph)
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int   *node_ids  = (int*)malloc(sizeof(int) * pr_config.top_nodes);
-  float *page_rank = (float*)malloc(sizeof(float) * pr_config.top_nodes);
-
-  // run pr calculations
-  gunrock_pr_func(
-    graph_output,
-    node_ids,
-    page_rank,
-    graph_input,
-    pr_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  if (pr_config.top_nodes > num_nodes) pr_config.top_nodes = num_nodes;
-  for (i = 0; i < pr_config.top_nodes; ++i)
-  {
-    printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], page_rank[i]);
-  }
-
-  if (node_ids)     { free(node_ids);     }
-  if (page_rank)    { free(page_rank);    }
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;    // integer type vertex_ids
+    data_t.SIZET_TYPE = SIZET_INT;    // integer type graph size
+    data_t.VALUE_TYPE = VALUE_FLOAT;  // float type value for pr
+
+    // pr configurations (optional)
+    struct GRSetup config;
+    config.device    =     0;  // use device 0
+    config.delta     = 0.85f;  // default delta value
+    config.error     = 0.01f;  // default error threshold
+    config.max_iter  =    20;  // maximum number of iterations
+    config.top_nodes =    10;  // number of top nodes
+    config.src_node  =     0;  // source node to begin page rank
+    config.src_mode  = manually;  // set source node manually
+
+    // define graph (CSR); NOTE(review): labelled undirected, but node 6 has no out-edges -- confirm
+    size_t num_nodes    = 7;
+    size_t num_edges    = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+
+    // malloc output graph and result buffers sized for top_nodes entries
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int   *node_ids  = (int*)malloc(sizeof(int) * config.top_nodes);
+    float *pagerank = (float*)malloc(sizeof(float) * config.top_nodes);
+
+    // run pr calculations; node_ids and pagerank are passed as output buffers
+    gunrock_pagerank(graph_o, node_ids, pagerank, graph_i, config, data_t);
+
+    // test print
+    int i;
+    printf("Demo Outputs:\n");
+    if (config.top_nodes > num_nodes) config.top_nodes = num_nodes;  // print at most num_nodes rows
+    for (i = 0; i < config.top_nodes; ++i) {
+        printf("Node ID [%d] : Page Rank [%f] \n", node_ids[i], pagerank[i]);
+    }
+
+    // clean up
+    if (node_ids) { free(node_ids); }
+    if (pagerank) { free(pagerank); }
+    if (graph_i)  { free(graph_i);  }
+    if (graph_o)  { free(graph_o);  }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_sssp.c b/shared_lib_tests/test_sssp.c
index f4fc0fe5b..e22370a3d 100644
--- a/shared_lib_tests/test_sssp.c
+++ b/shared_lib_tests/test_sssp.c
@@ -1,75 +1,63 @@
 /**
  * @brief SSSP test for shared library
  * @file test_sssp.c
- *
- * set input graph, configs and call function gunrock_sssp_func
- * return per node or per edge values in graph_out node_values
  */
 
 #include <stdio.h>
 #include <gunrock/gunrock.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_UINT;
-
-  // pr configurations (optional)
-  struct GunrockConfig sssp_config;
-  sssp_config.device       =    0;
-  sssp_config.mark_pred    = true;
-  sssp_config.queue_size   = 1.0f;
-  sssp_config.delta_factor =    1;
-  sssp_config.src_mode     = randomize;
-  //sssp_config.src_node     =    1;
-
-  // define graph
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
-
-  int row_offsets[8]           = {0,3,6,9,11,14,15,15};
-  int col_indices[15]          = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
-  unsigned int edge_values[15] = {39,6,41,51,63,17,10,44,41,13,58,43,50,59,35};
-
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes   = num_nodes;
-  graph_input->num_edges   = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-  graph_input->edge_values = (void*)&edge_values[0];
-
-  // malloc output graph
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int *predecessor = (int*)malloc(sizeof(int) * num_nodes);
-
-  // run sssp calculations
-  gunrock_sssp_func(
-    graph_output,
-    predecessor,
-    graph_input,
-    sssp_config,
-    data_type);
-
-  // test print
-  int i;
-  printf("Demo Outputs:\n");
-  int *label = (int*)malloc(sizeof(int) * num_nodes);
-  label = (int*)graph_output->node_values;
-  for (i = 0; i < num_nodes; ++i)
-  {
-    printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n",
-           i, label[i], predecessor[i]);
-  }
-
-  if (predecessor)  { free(predecessor);  }
-  if (graph_input)  { free(graph_input);  }
-  if (graph_output) { free(graph_output); }
-
-  return 0;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_UINT;
+
+    // configurations (optional)
+    struct GRSetup config;
+    config.device       =    0;
+    config.mark_pred    = true;
+    config.queue_size   = 1.0f;
+    config.delta_factor =    1;
+    config.src_mode     = randomize;
+
+    // define graph
+    size_t num_nodes = 7;
+    size_t num_edges = 15;
+
+    int row_offsets[8]           = {0, 3, 6, 9, 11, 14, 15, 15};
+    int col_indices[15]          = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
+    unsigned int edge_values[15] = {39, 6, 41, 51, 63, 17, 10, 44, 41, 13, 58, 43, 50, 59, 35};
+
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes   = num_nodes;
+    graph_i->num_edges   = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->edge_values = (void*)&edge_values[0];
+
+    // malloc output graph
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int *predecessor = (int*)malloc(sizeof(int) * num_nodes);
+
+    // run calculations
+    gunrock_sssp(graph_o, predecessor, graph_i, config, data_t);
+
+    // demo test print
+    printf("Demo Outputs:\n");
+    // borrow the per-node result buffer; a separate malloc would be leaked
+    int *label = (int*)graph_o->node_values;
+    int node;
+    for (node = 0; node < num_nodes; ++node) {
+        printf("Node ID [%d] : Label [%d] : Predecessor [%d]\n",
+               node, label[node], predecessor[node]);
+    }
+
+    // clean up
+    if (predecessor) { free(predecessor); }
+    if (graph_i) { free(graph_i); }
+    if (graph_o) { free(graph_o); }
+
+    return 0;
 }
diff --git a/shared_lib_tests/test_topk.c b/shared_lib_tests/test_topk.c
index 1feea5e97..416fe9f7c 100644
--- a/shared_lib_tests/test_topk.c
+++ b/shared_lib_tests/test_topk.c
@@ -1,68 +1,65 @@
+/**
+ * @brief Top K test for shared library
+ * @file test_topk.c
+ */
+
 #include <gunrock/gunrock.h>
 #include <stdio.h>
 
-int main(int argc, char* argv[])
-{
-  // define data types
-  struct GunrockDataType data_type;
-  data_type.VTXID_TYPE = VTXID_INT;
-  data_type.SIZET_TYPE = SIZET_INT;
-  data_type.VALUE_TYPE = VALUE_INT;
+int main(int argc, char* argv[]) {
+    // define data types
+    struct GRTypes data_t;
+    data_t.VTXID_TYPE = VTXID_INT;
+    data_t.SIZET_TYPE = SIZET_INT;
+    data_t.VALUE_TYPE = VALUE_INT;
+
+    struct GRSetup config;
+    config.device    = 0;
+    config.top_nodes = 3;  // sizes the three result arrays and the print loop
 
-  struct GunrockConfig topk_config;
-  topk_config.device    = 0;
-  topk_config.top_nodes = 3;
+    // define directed graph, stored both as out-edges and as in-edges
+    size_t num_nodes = 7;
+    size_t num_edges = 15;
 
-  // define graph (directed, reversed and non-reversed)
-  size_t num_nodes = 7;
-  size_t num_edges = 15;
+    int row_offsets[8]  = {0, 3, 6, 9, 11, 14, 15, 15};  // out-edges (CSR)
+    int col_indices[15] = {1, 2, 3, 0, 2, 4, 3, 4, 5, 5, 6, 2, 5, 6, 6};
 
-  int row_offsets[8] = {0,3,6,9,11,14,15,15};
-  int col_indices[15] = {1,2,3,0,2,4,3,4,5,5,6,2,5,6,6};
+    int col_offsets[8]  = {0, 1, 2, 5, 7, 9, 12, 15};  // in-edges (transpose)
+    int row_indices[15] = {1, 0, 0, 1, 4, 0, 2, 1, 2, 2, 3, 4, 3, 4, 5};
 
-  int col_offsets[8] = {0,1,2,5,7,9,12,15};
-  int row_indices[15] = {1,0,0,1,4,0,2,1,2,2,3,4,3,4,5};
+    // build graph as input
+    struct GRGraph *graph_i = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    graph_i->num_nodes = num_nodes;
+    graph_i->num_edges = num_edges;
+    graph_i->row_offsets = (void*)&row_offsets[0];
+    graph_i->col_indices = (void*)&col_indices[0];
+    graph_i->col_offsets = (void*)&col_offsets[0];
+    graph_i->row_indices = (void*)&row_indices[0];
 
-  // build graph as input
-  struct GunrockGraph *graph_input =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  graph_input->num_nodes = num_nodes;
-  graph_input->num_edges = num_edges;
-  graph_input->row_offsets = (void*)&row_offsets[0];
-  graph_input->col_indices = (void*)&col_indices[0];
-  graph_input->col_offsets = (void*)&col_offsets[0];
-  graph_input->row_indices = (void*)&row_indices[0];
+    // malloc output graph and top_nodes-sized result arrays
+    struct GRGraph *graph_o = (struct GRGraph*)malloc(sizeof(struct GRGraph));
+    int *node_ids    = (int*)malloc(sizeof(int) * config.top_nodes);
+    int *in_degrees  = (int*)malloc(sizeof(int) * config.top_nodes);
+    int *out_degrees = (int*)malloc(sizeof(int) * config.top_nodes);
 
-  // malloc output result arrays
-  struct GunrockGraph *graph_output =
-    (struct GunrockGraph*)malloc(sizeof(struct GunrockGraph));
-  int *node_ids    = (int*)malloc(sizeof(int) * topk_config.top_nodes);
-  int *in_degrees  = (int*)malloc(sizeof(int) * topk_config.top_nodes);
-  int *out_degrees = (int*)malloc(sizeof(int) * topk_config.top_nodes);
+    // run topk calculations
+    gunrock_topk(
+        graph_o, node_ids, in_degrees, out_degrees, graph_i, config, data_t);
 
-  // run topk calculations
-  gunrock_topk_func(
-    graph_output,
-    node_ids,
-    in_degrees,
-    out_degrees,
-    graph_input,
-    topk_config,
-    data_type);
+    // print results to check correctness
+    printf("Demo Outputs:\n");
+    int node;
+    for (node = 0; node < config.top_nodes; ++node) {
+        printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n",
+               node_ids[node], in_degrees[node], out_degrees[node]);
+    }
 
-  // print results for check correctness
-  int i;
-  printf("Demo Outputs:\n");
-  for (i = 0; i < topk_config.top_nodes; ++i)
-  {
-    printf("Node ID [%d] : in_degrees [%d] : out_degrees [%d] \n",
-      node_ids[i], in_degrees[i], out_degrees[i]);
-  }
+    // clean up
+    if (in_degrees)  free(in_degrees);
+    if (out_degrees) free(out_degrees);
+    if (node_ids)    free(node_ids);
+    if (graph_i)     free(graph_i);
+    if (graph_o)     free(graph_o);
 
-  if (in_degrees)   free(in_degrees);
-  if (out_degrees)  free(out_degrees);
-  if (node_ids)     free(node_ids);
-  if (graph_input)  free(graph_input);
-  if (graph_output) free(graph_output);
-  return 0;
+    return 0;
 }
\ No newline at end of file
diff --git a/simple_example/Makefile b/simple_example/Makefile
index 9762bce03..5fade4d66 100644
--- a/simple_example/Makefile
+++ b/simple_example/Makefile
@@ -94,7 +94,7 @@ else
 	ARCH = -m64
 endif
 
-NVCCFLAGS = -Xptxas -v -Xcudafe -\#
+NVCCFLAGS = -Xptxas -v -Xcudafe -\# -lineinfo
 
 ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
 	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
@@ -123,15 +123,15 @@ endif
 # Dependency Lists
 #-------------------------------------------------------------------------------
 
-DEPS =			./Makefile \
-				$(wildcard ../gunrock/util/*.cuh) \
-				$(wildcard ../gunrock/util/**/*.cuh) \
-				$(wildcard ../gunrock/*.cuh) \
-				$(wildcard ../gunrock/graphio/*.cuh) \
-				$(wildcard ../gunrock/oprtr/*.cuh) \
-				$(wildcard ../gunrock/oprtr/**/*.cuh) \
-				$(wildcard ../gunrock/app/*.cuh) \
-				$(wildcard ../gunrock/app/**/*.cuh)
+DEPS = ./Makefile \
+	$(wildcard ../gunrock/util/*.cuh) \
+	$(wildcard ../gunrock/util/**/*.cuh) \
+	$(wildcard ../gunrock/*.cuh) \
+	$(wildcard ../gunrock/graphio/*.cuh) \
+	$(wildcard ../gunrock/oprtr/*.cuh) \
+	$(wildcard ../gunrock/oprtr/**/*.cuh) \
+	$(wildcard ../gunrock/app/*.cuh) \
+	$(wildcard ../gunrock/app/**/*.cuh)
 
 #-------------------------------------------------------------------------------
 # (make simple) Simple example driver for three primitives: CC, BFS and BC
@@ -139,9 +139,9 @@ DEPS =			./Makefile \
 
 simple: bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX)
 
-bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
+bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) : simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
 	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/simple_example_$(NVCC_VERSION)_$(ARCH_SUFFIX) simple_example.cu cpu_graph_lib.cpp ../gunrock/util/error_utils.cu ../externals/moderngpu/src/mgpucontext.cu ../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -lcuda -O3
 
 #-------------------------------------------------------------------------------
 # Clean
diff --git a/tests/bc/test_bc.cu b/tests/bc/test_bc.cu
index 91ceb9975..103aa4325 100644
--- a/tests/bc/test_bc.cu
+++ b/tests/bc/test_bc.cu
@@ -253,7 +253,7 @@ void RefCPUBC(
 
         for (idx = 0; idx < graph.edges; ++idx) {
             //std::cout << coo[idx].row << "," << coo[idx].col << ":" << coo[idx].val << std::endl;
-            ebc_values[idx] = coo[idx].val;
+            //ebc_values[idx] = coo[idx].val;
         }
 
         printf("CPU BC finished in %lf msec.", elapsed);
@@ -315,9 +315,12 @@ void RefCPUBC(
 
         for (int iter = search_depth - 2; iter > 0; --iter)
         {
+
+            int cur_level = 0;
             for (int node = 0; node < graph.nodes; ++node)
             {
                 if (source_path[node] == iter) {
+                    ++cur_level;
                     int edges_begin = graph.row_offsets[node];
                     int edges_end = graph.row_offsets[node+1];
 
@@ -334,9 +337,7 @@ void RefCPUBC(
         }
 
         for (int i = 0; i < graph.nodes; ++i)
-        {
             bc_values[i] *= 0.5f;
-        }
 
         cpu_timer.Stop();
         float elapsed = cpu_timer.ElapsedMillis();
@@ -847,6 +848,7 @@ int main( int argc, char** argv)
             return 1;
         }
 
+        csr.PrintHistogram();
 
     } else if (graph_type == "rmat")
     {   
diff --git a/tests/hits/CMakeLists.txt b/tests/hits/CMakeLists.txt
index 2ba54cb95..ef9e22ff5 100644
--- a/tests/hits/CMakeLists.txt
+++ b/tests/hits/CMakeLists.txt
@@ -12,7 +12,7 @@ set (mgpu_SOURCE_FILES
   ${mgpu_SOURCE_DIRS}/mgpucontext.cu
   ${mgpu_SOURCE_DIRS}/mgpuutil.cpp)
 
-CUDA_ADD_EXECUTABLE(hyperlink_induced_topic_search
+CUDA_ADD_EXECUTABLE(HITS
   test_hits.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/test_utils.cu
   ${CMAKE_SOURCE_DIR}/gunrock/util/error_utils.cu
diff --git a/tests/mst/run.sh b/tests/mst/run.sh
new file mode 100644
index 000000000..694a12fc0
--- /dev/null
+++ b/tests/mst/run.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+OPTION="--quick"
+
+# --quick skips the CPU reference algorithm. To also run the CPU reference,
+# remove $OPTION from the command lines below. Warning: for large data
+# this can take a long time.
+
+# get all execution files in ./bin (bash array, hence the bash shebang)
+files=(./bin/*)
+
+# largest version number seen so far; compared as strings, which is only
+# correct while versions look like single-digit X.Y (e.g. 6.5, 7.0)
+max_ver_num=""
+exe_file=${files[0]}
+
+# iterate over all file names to get the largest version number
+for x in "${files[@]}"
+do
+    output=$(grep -o "[0-9]\.[0-9]" <<<"$x" | head -n 1)
+    if [[ "$output" > "$max_ver_num" ]]; then
+        max_ver_num=$output; exe_file=$x
+    fi
+done
+
+# put OS and Device here
+SUFFIX="ubuntu12.04.k40c"
+
+mkdir -p eval/$SUFFIX
+
+for i in belgium_osm coAuthorsDBLP delaunay_n13 delaunay_n21
+do
+    echo $exe_file market ../../dataset/large/$i/$i.mtx $OPTION
+         $exe_file market ../../dataset/large/$i/$i.mtx $OPTION > eval/$SUFFIX/$i.$SUFFIX.txt
+    sleep 1
+done
diff --git a/tests/mst/test_mst.cu b/tests/mst/test_mst.cu
index 6083d9bd5..7c9eb5768 100644
--- a/tests/mst/test_mst.cu
+++ b/tests/mst/test_mst.cu
@@ -28,6 +28,7 @@
 #include <gunrock/graphio/market.cuh>
 
 // MST includes
+#include <gunrock/app/cc/cc_app.cu>
 #include <gunrock/app/mst/mst_enactor.cuh>
 #include <gunrock/app/mst/mst_problem.cuh>
 #include <gunrock/app/mst/mst_functor.cuh>
@@ -83,6 +84,11 @@ void Usage()
 /**
  * @brief Displays the MST result
  *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ *
+ * @param[in] graph reference to the CSR graph we process on
  */
 ////////////////////////////////////////////////////////////////////////////////
 template<typename VertexId, typename Value, typename SizeT>
@@ -122,6 +128,25 @@ void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph, int *mst_output)
   if (source) { delete [] source; }
 }
 
+/**
+ * @brief A simple connectivity check utility
+ *
+ * @tparam VertexId
+ * @tparam Value
+ * @tparam SizeT
+ * @param[in] graph reference to the CSR graph we process on
+ * @return true when run_cc reports exactly one component, false otherwise
+ */
+template<typename VertexId, typename Value, typename SizeT>
+bool IsConnected(const Csr<VertexId, Value, SizeT> & graph)
+{
+  GRGraph *temp = (GRGraph*)malloc(sizeof(GRGraph));  // passed to run_cc, discarded
+  unsigned int components = 0;  // on the stack, so there is nothing to free
+  run_cc<VertexId, Value, SizeT>(temp, &components, graph, 0, 1);
+  if (temp) free(temp);
+  return components == 1;
+}
+
 /**
  * @brief A simple CPU-based reference MST implementation.
  *
@@ -136,18 +161,18 @@ void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph, int *mst_output)
  */
 ////////////////////////////////////////////////////////////////////////////////
 template<typename VertexId, typename Value, typename SizeT>
-long long int SimpleReferenceMST(
+Value SimpleReferenceMST(
   const Value *edge_values, const Csr<VertexId, Value, SizeT> &graph)
 {
-  printf("\nREFERENCE TEST\n");
+  printf("\nMST CPU REFERENCE TEST\n");
 
   // Kruskal minimum spanning tree preparations
   using namespace boost;
-  typedef adjacency_list < vecS, vecS, undirectedS,
-    no_property, property < edge_weight_t, int > >  Graph;
+  typedef adjacency_list< vecS, vecS, undirectedS,
+    no_property, property<edge_weight_t, int> >   Graph;
   typedef graph_traits < Graph >::edge_descriptor   Edge;
   typedef graph_traits < Graph >::vertex_descriptor Vertex;
-  typedef std::pair<int, int> E;
+  typedef std::pair<VertexId, VertexId> E;
 
   E *edge_pairs = new E[graph.edges];
   int idx = 0;
@@ -165,16 +190,18 @@ long long int SimpleReferenceMST(
 
   CpuTimer cpu_timer; // record the kernel running time
   cpu_timer.Start();
+
   // compute reference using kruskal_min_spanning_tree algorithm
   kruskal_minimum_spanning_tree(g, std::back_inserter(spanning_tree));
+
   cpu_timer.Stop();
   float elapsed_cpu = cpu_timer.ElapsedMillis();
 
   // analyze reference results
-  SizeT         num_selected_cpu = 0;
-  long long int total_weight_cpu = 0;
+  SizeT num_selected_cpu = 0;
+  Value total_weight_cpu = 0;
 
-  if (graph.nodes <= 50) printf("CPU Minimum Spanning Tree\n");
+  if (graph.nodes <= 50) { printf("CPU Minimum Spanning Tree\n"); }
   for (std::vector < Edge >::iterator ei = spanning_tree.begin();
        ei != spanning_tree.end(); ++ei)
   {
@@ -182,7 +209,7 @@ long long int SimpleReferenceMST(
     {
       // print the edge pairs in the minimum spanning tree
       printf("%ld %ld\n", source(*ei, g), target(*ei, g));
-      // printf("  with weight of %d\n", weight[*ei]);
+      // printf("  with weight of %f\n", weight[*ei]);
     }
     ++num_selected_cpu;
     total_weight_cpu += weight[*ei];
@@ -214,10 +241,10 @@ long long int SimpleReferenceMST(
 ////////////////////////////////////////////////////////////////////////////////
 template <typename VertexId, typename Value, typename SizeT, bool INSTRUMENT>
 void RunTests(
-  const Csr<VertexId, Value, SizeT> &graph,
+  const Csr<VertexId, Value, SizeT> & graph,
   int max_grid_size,
   int num_gpus,
-  mgpu::CudaContext& context)
+  mgpu::CudaContext & context)
 {
   printf("\nMINIMUM SPANNING TREE TEST\n");
 
@@ -243,7 +270,7 @@ void RunTests(
     "MST Problem Data Reset Failed", __FILE__, __LINE__);
 
   // perform MST
-  GpuTimer gpu_timer; // record the kernel running time
+  GpuTimer gpu_timer;  // record the kernel running time
 
   gpu_timer.Start();
 
@@ -261,7 +288,7 @@ void RunTests(
   util::GRError(mst_problem->Extract(h_mst_output),
     "MST Problem Data Extraction Failed", __FILE__, __LINE__);
 
-  if (!g_quick) // run CPU reference test
+  if (!g_quick)  // run CPU reference test
   {
     // calculate GPU final number of selected edges
     int num_selected_gpu = 0;
@@ -272,27 +299,27 @@ void RunTests(
     // printf("\nGPU - Number of Edges in MST: %d\n", num_selected_gpu);
 
     // calculate GPU total selected MST weights for validation
-    long long int total_weight_gpu = 0;
+    Value total_weight_gpu = 0;
     for (int iter = 0; iter < graph.edges; ++iter)
     {
       total_weight_gpu += h_mst_output[iter] * graph.edge_values[iter];
     }
 
     // correctness validation
-    long long int total_weight_cpu =
-      SimpleReferenceMST(graph.edge_values, graph);
+    Value total_weight_cpu = SimpleReferenceMST(graph.edge_values, graph);
     if (total_weight_cpu == total_weight_gpu)
     {
       // print the edge pairs in the minimum spanning tree
       DisplaySolution(graph, h_mst_output);
       printf("\nCORRECT.\n");
+      std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl;
     }
     else
     {
-      printf("INCORRECT. \n"
-             "CPU Computed Total Weight = %lld\n"
-             "GPU Computed Total Weight = %lld\n",
-             total_weight_cpu, total_weight_gpu);
+      printf("INCORRECT.\n");
+      std::cout << "CPU Total Weight = " << total_weight_cpu << std::endl;
+      std::cout << "GPU Total Weight = " << total_weight_gpu << std::endl;
     }
   }
 
@@ -316,17 +343,16 @@ void RunTests(
  */
 template <typename VertexId, typename Value, typename SizeT>
 void RunTests(
-  const Csr<VertexId, Value, SizeT> &graph,
-  CommandLineArgs                   &args,
-  mgpu::CudaContext&                context)
+  const Csr<VertexId, Value, SizeT> & graph,
+  CommandLineArgs                   & args,
+  mgpu::CudaContext                 & context)
 {
-  bool instrumented  = false; // do not collect instrumentation from kernels
-  int  max_grid_size = 0;     // maximum grid size (up to the enactor)
-  int  num_gpus      = 1;     // number of GPUs for multi-gpu enactor to use
-  g_quick            = false; // Whether or not to skip ref validation
+  bool instrumented  = 0;  // do not collect instrumentation from kernels
+  int  max_grid_size = 0;  // maximum grid size (up to the enactor)
+  int  num_gpus      = 1;  // number of GPUs for multi-gpu enactor to use
+  g_quick            = 0;  // Whether or not to skip ref validation
 
   instrumented = args.CheckCmdLineFlag("instrumented");
-
   g_quick = args.CheckCmdLineFlag("quick");
   g_verbose = args.CheckCmdLineFlag("v");
 
@@ -376,12 +402,12 @@ int main(int argc, char** argv)
 
   if (graph_type == "market")
   {
-
     // matrix-market coordinate-formatted graph file
 
-    typedef int VertexId; // use as the vertex identifier type
-    typedef int Value;    // use as the value type
-    typedef int SizeT;    // use as the graph size type
+    // currently support Value type: int, float, double
+    typedef int VertexId;  // use as the vertex identifier
+    typedef int Value;     // use as the value type
+    typedef int SizeT;     // use as the graph size
 
     // default value for stream_from_host is false
     if (graph_args < 1)
@@ -396,28 +422,28 @@ int main(int argc, char** argv)
     // template argument = true because the graph has edge values
     Csr<VertexId, Value, SizeT> csr(false);
     if (graphio::BuildMarketGraph<true>(
-      market_filename,
-      csr,
-      g_undirected,
-      false) != 0) { return 1; }
+      market_filename, csr, g_undirected, false) != 0) { return 1; }
 
-    // display graph
-    // csr.DisplayGraph();
+    // display input graph
+    // csr.DisplayGraph(true);
 
-    /***************************************************************
-    * To make sure two graphs have same weight value for each edge *
-    * we have to change ll_value = rand()%64 in market.cuh file to *
-    * some NON-RANDOM value if the original graph does NOT contain *
-    * weight per edge. Note it only support FULLY-CONNECTED graphs *
-    ***************************************************************/
-
-    // run GPU tests
-    RunTests(csr, args, *context);
+    /**************************************************************************
+     * Note: Minimum Spanning Tree only supports undirected, connected graphs *
+     **************************************************************************/
 
+    // test graph connectivity
+    if (IsConnected(csr))
+    {
+        RunTests(csr, args, *context);
+    }
+    else
+    {
+      fprintf(stderr, "Unsupported non-fully connected graph input.\n");
+    }
   }
   else
   {
-    fprintf(stderr, "Unspecified graph type\n");
+    fprintf(stderr, "Unspecified graph type.\n");
     return 1;
   }
 
@@ -428,4 +454,4 @@ int main(int argc, char** argv)
 // Local Variables:
 // mode:c++
 // c-file-style: "NVIDIA"
-// End:
+// End:
diff --git a/tests/sssp/ppopp-test.sh b/tests/sssp/ppopp-test.sh
index cbc55562d..8934de90b 100644
--- a/tests/sssp/ppopp-test.sh
+++ b/tests/sssp/ppopp-test.sh
@@ -1,7 +1,7 @@
 mkdir -p eval/PPOPP15
 for i in  1-soc 2-bitcoin 3-kron 6-roadnet
 do
-    echo ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10
-         ./bin/test_sssp_6.5_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt
+    echo ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10
+         ./bin/test_sssp_7.0_x86_64 market /data/PPOPP15/$i.mtx --src=0 --undirected --iteration-num=10 --delta-factor=32 > eval/PPOPP15/$i.txt
     sleep 1
 done
diff --git a/tests/sssp/test_sssp.cu b/tests/sssp/test_sssp.cu
index b9075bbc8..98663fbac 100644
--- a/tests/sssp/test_sssp.cu
+++ b/tests/sssp/test_sssp.cu
@@ -265,10 +265,10 @@ template<
     typename SizeT,
     bool     MARK_PREDECESSORS>
 void SimpleReferenceSssp(
-    const Csr<VertexId, Value, SizeT>       &graph,
-    Value                                   *node_values,
-    VertexId                                *node_preds,
-    VertexId                                src)
+    const Csr<VertexId, Value, SizeT> &graph,
+    Value                             *node_values,
+    VertexId                          *node_preds,
+    VertexId                          src)
 {
     using namespace boost;
 
@@ -279,11 +279,10 @@ void SimpleReferenceSssp(
     typedef graph_traits<Graph>::vertex_descriptor vertex_descriptor;
     typedef graph_traits<Graph>::edge_descriptor edge_descriptor;
 
-    typedef std::pair<unsigned int, unsigned int> Edge;
+    typedef std::pair<VertexId, VertexId> Edge;
 
-    Edge* edges = (Edge*)malloc(sizeof(Edge)*graph.edges);
-    unsigned int *weight =
-        (unsigned int*)malloc(sizeof(unsigned int)*graph.edges);
+    Edge   *edges = ( Edge*)malloc(sizeof( Edge)*graph.edges);
+    Value *weight = (Value*)malloc(sizeof(Value)*graph.edges);
 
     for (int i = 0; i < graph.nodes; ++i)
     {
@@ -296,7 +295,7 @@ void SimpleReferenceSssp(
 
     Graph g(edges, edges + graph.edges, weight, graph.nodes);
 
-    std::vector<unsigned int> d(graph.nodes);
+    std::vector<Value> d(graph.nodes);
     std::vector<vertex_descriptor> p(graph.nodes);
     vertex_descriptor s = vertex(src, g);
 
@@ -309,28 +308,30 @@ void SimpleReferenceSssp(
     CpuTimer cpu_timer;
     cpu_timer.Start();
 
-    if (MARK_PREDECESSORS)
-        dijkstra_shortest_paths(
-            g, s,
-            predecessor_map(boost::make_iterator_property_map(p.begin(), get(boost::vertex_index, g))).
-            distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g))));
-    else
-        dijkstra_shortest_paths(
-            g, s,
-            distance_map(boost::make_iterator_property_map(d.begin(), get(boost::vertex_index, g))));
+    if (MARK_PREDECESSORS) {
+        dijkstra_shortest_paths(g, s,
+            predecessor_map(boost::make_iterator_property_map(
+                    p.begin(), get(boost::vertex_index, g))).distance_map(
+                        boost::make_iterator_property_map(
+                            d.begin(), get(boost::vertex_index, g))));
+    } else {
+        dijkstra_shortest_paths(g, s,
+            distance_map(boost::make_iterator_property_map(
+                    d.begin(), get(boost::vertex_index, g))));
+    }
     cpu_timer.Stop();
     float elapsed = cpu_timer.ElapsedMillis();
 
     printf("CPU SSSP finished in %lf msec.\n", elapsed);
 
-    Coo<unsigned int, unsigned int>* sort_dist = NULL;
-    Coo<unsigned int, unsigned int>* sort_pred = NULL;
-    sort_dist = (Coo<unsigned int, unsigned int>*)malloc(
-        sizeof(Coo<unsigned int, unsigned int>) * graph.nodes);
-    if (MARK_PREDECESSORS)
-        sort_pred = (Coo<unsigned int, unsigned int>*)malloc(
-            sizeof(Coo<unsigned int, unsigned int>) * graph.nodes);
-
+    Coo<Value, Value>* sort_dist = NULL;
+    Coo<VertexId, VertexId>* sort_pred = NULL;
+    sort_dist = (Coo<Value, Value>*)malloc(
+        sizeof(Coo<Value, Value>) * graph.nodes);
+    if (MARK_PREDECESSORS) {
+        sort_pred = (Coo<VertexId, VertexId>*)malloc(
+            sizeof(Coo<VertexId, VertexId>) * graph.nodes);
+    }
     graph_traits < Graph >::vertex_iterator vi, vend;
     for (tie(vi, vend) = vertices(g); vi != vend; ++vi)
     {
@@ -339,7 +340,7 @@ void SimpleReferenceSssp(
     }
     std::stable_sort(
         sort_dist, sort_dist + graph.nodes,
-        RowFirstTupleCompare<Coo<unsigned int, unsigned int> >);
+        RowFirstTupleCompare<Coo<Value, Value> >);
 
     if (MARK_PREDECESSORS)
     {
@@ -350,21 +351,21 @@ void SimpleReferenceSssp(
         }
         std::stable_sort(
             sort_pred, sort_pred + graph.nodes,
-            RowFirstTupleCompare<Coo<unsigned int, unsigned int> >);
+            RowFirstTupleCompare< Coo<VertexId, VertexId> >);
     }
 
     for (int i = 0; i < graph.nodes; ++i)
     {
         node_values[i] = sort_dist[i].col;
     }
-    if (MARK_PREDECESSORS)
+    if (MARK_PREDECESSORS) {
         for (int i = 0; i < graph.nodes; ++i)
         {
             node_preds[i] = sort_pred[i].col;
         }
-
-    free(sort_dist);
-    if (MARK_PREDECESSORS) free(sort_pred);
+    }
+    if (sort_dist) free(sort_dist);
+    if (sort_pred) free(sort_pred);
 }
 
 
@@ -687,6 +688,7 @@ void RunTests(
     parameter -> gpu_idx            = gpu_idx;
     parameter -> streams            = streams;
 
+    // source vertex to start
     args.GetCmdLineArgument("src", src_str);
     if (src_str.empty()) {
         parameter->src = 0;
@@ -704,7 +706,7 @@ void RunTests(
     args.GetCmdLineArgument("traversal-mode", parameter->traversal_mode);
     if (parameter->traversal_mode == -1)
     {
-        parameter->traversal_mode = graph->GetAverageDegree() > 8 ? 0 : 1;
+        parameter->traversal_mode = 0;
     }
 
     printf("src = %lld\n", parameter->src);
@@ -780,8 +782,7 @@ int main( int argc, char** argv)
     if (graph_args < 1) { Usage(); return 1; }
 
     if (graph_type == "market") {
-    // Matrix-market coordinate-formatted graph file
-
+        // Matrix-market coordinate-formatted graph file
         char *market_filename = (graph_args == 2) ? argv[2] : NULL;
         if (graphio::BuildMarketGraph<true>(
             market_filename, 
diff --git a/tests/vis/Makefile b/tests/vis/Makefile
new file mode 100644
index 000000000..7931cd948
--- /dev/null
+++ b/tests/vis/Makefile
@@ -0,0 +1,108 @@
+# -----------------------------------------------------------------------------
+# Gunrock -- High-Performance Graph Primitives on GPU
+# -----------------------------------------------------------------------------
+# This source code is distributed under the terms of LICENSE.TXT
+# in the root directory of this source distribution.
+# -----------------------------------------------------------------------------
+# Build script for project
+# -----------------------------------------------------------------------------
+
+force64 = 1
+NVCC = "$(shell which nvcc)"
+NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))
+
+KERNELS =
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+# -----------------------------------------------------------------------------
+# Gen targets
+# -----------------------------------------------------------------------------
+
+GEN_SM35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+GEN_SM30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+SM_TARGETS = $(GEN_SM35)
+
+# -----------------------------------------------------------------------------
+# Libs
+# -----------------------------------------------------------------------------
+
+
+# -----------------------------------------------------------------------------
+# Includes
+# -----------------------------------------------------------------------------
+
+CUDA_INC = "$(shell dirname $(NVCC))/../include"
+MGPU_INC = "../../externals/moderngpu/include"
+INC = -I$(CUDA_INC) -I$(MGPU_INC) -I.. -I../..
+
+# -----------------------------------------------------------------------------
+# Defines
+# -----------------------------------------------------------------------------
+
+DEFINES =
+
+# -----------------------------------------------------------------------------
+# Compiler Flags
+# -----------------------------------------------------------------------------
+
+ifneq ($(force64), 1)
+	# Compile with 32-bit device pointers by default
+	ARCH_SUFFIX = i386
+	ARCH = -m32
+else
+	ARCH_SUFFIX = x86_64
+	ARCH = -m64
+endif
+
+NVCCFLAGS = -Xcudafe -\#
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+endif
+
+
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+ifdef maxregisters
+	NVCCFLAGS += -maxrregcount $(maxregisters)
+endif
+
+# -----------------------------------------------------------------------------
+# Dependency Lists
+# -----------------------------------------------------------------------------
+
+DEPS = ./Makefile \
+	$(wildcard ../../gunrock/util/*.cuh) \
+	$(wildcard ../../gunrock/util/**/*.cuh) \
+	$(wildcard ../../gunrock/*.cuh) \
+	$(wildcard ../../gunrock/graphio/*.cuh) \
+	$(wildcard ../../gunrock/oprtr/*.cuh) \
+	$(wildcard ../../gunrock/oprtr/**/*.cuh) \
+	$(wildcard ../../gunrock/app/*.cuh) \
+	$(wildcard ../../gunrock/app/**/*.cuh)
+
+# -----------------------------------------------------------------------------
+# (make test) Test driver for the Vertex-Induced Subgraph primitive
+# -----------------------------------------------------------------------------
+.PHONY: test clean
+test: bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX)
+
+bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) : test_vis.cu ../../gunrock/util/test_utils.cu ../../gunrock/util/error_utils.cu ../../externals/moderngpu/src/mgpucontext.cu ../../externals/moderngpu/src/mgpuutil.cpp $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_vis_$(NVCC_VERSION)_$(ARCH_SUFFIX) test_vis.cu ../../gunrock/util/test_utils.cu ../../gunrock/util/error_utils.cu ../../externals/moderngpu/src/mgpucontext.cu ../../externals/moderngpu/src/mgpuutil.cpp $(NVCCFLAGS) $(ARCH) $(INC) -O3
+
+# -----------------------------------------------------------------------------
+# Clean (phony: always runs, even if a file named "clean" exists)
+# -----------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*_$(NVCC_VERSION)_$(ARCH_SUFFIX)*
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
diff --git a/tests/vis/run.sh b/tests/vis/run.sh
new file mode 100644
index 000000000..708cedfec
--- /dev/null
+++ b/tests/vis/run.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# get all execution files in ./bin (bash array, hence the bash shebang)
+files=(./bin/*)
+# largest version seen so far; string comparison, fine for single-digit X.Y
+max_ver_num=""
+exe_file=${files[0]}
+# iterate over all file names to get the largest version number
+for x in "${files[@]}"
+do
+    output=$(grep -o "[0-9]\.[0-9]" <<<"$x" | head -n 1)
+    if [[ "$output" > "$max_ver_num" ]]; then
+        max_ver_num=$output
+        exe_file=$x
+    fi
+done
+
+# put OS and Device type here
+SUFFIX="ubuntu12.04.k40c"
+
+mkdir -p eval/$SUFFIX
+
+for i in test_bc
+do
+    echo $exe_file market ../../dataset/small/$i.mtx
+    $exe_file market ../../dataset/small/$i.mtx > eval/$SUFFIX/$i.$SUFFIX.txt
+    sleep 1
+done
diff --git a/tests/vis/test_vis.cu b/tests/vis/test_vis.cu
new file mode 100644
index 000000000..3584b9bff
--- /dev/null
+++ b/tests/vis/test_vis.cu
@@ -0,0 +1,346 @@
+// ----------------------------------------------------------------------------
+// Gunrock -- High-Performance Graph Primitives on GPU
+// ----------------------------------------------------------------------------
+// This source code is distributed under the terms of LICENSE.TXT
+// in the root directory of this source distribution.
+// ----------------------------------------------------------------------------
+
+/**
+ * @file test_vis.cu
+ * @brief Simple test driver program for Vertex-Induced Subgraph
+ */
+
+#include <stdio.h>
+#include <string>
+#include <deque>
+#include <vector>
+#include <iostream>
+
+// utilities for correctness checking
+#include <gunrock/util/test_utils.cuh>
+
+// graph construction utilities
+#include <gunrock/graphio/market.cuh>
+
+// primitive-specific headers include
+#include <gunrock/app/vis/vis_enactor.cuh>
+#include <gunrock/app/vis/vis_problem.cuh>
+#include <gunrock/app/vis/vis_functor.cuh>
+
+// gunrock abstraction graph operators
+#include <gunrock/oprtr/advance/kernel.cuh>
+#include <gunrock/oprtr/filter/kernel.cuh>
+
+#include <moderngpu.cuh>
+
+using namespace gunrock;
+using namespace gunrock::util;
+using namespace gunrock::oprtr;
+using namespace gunrock::app::vis;
+
+// ----------------------------------------------------------------------------
+// Defines, constants, globals
+// ----------------------------------------------------------------------------
+
+bool g_verbose;
+bool g_undirected;
+bool g_quick;
+bool g_stream_from_host;
+
+// ----------------------------------------------------------------------------
+// Housekeeping Routines
+// ----------------------------------------------------------------------------
+void Usage() {  // prints the command-line help text with a single printf
+    printf(
+        " test_vis <graph type> <graph type args> [--undirected] [--quick]\n"
+        " [--device=<device_index>] [--instrumented] [--iteration-num=<num>]\n"
+        " [--v] [--traversal-mode=<0|1>] [--queue-sizing=<scale factor>]\n"
+        "Graph types and arguments:\n"
+        "  market <file>\n"
+        "    Reads a Matrix-Market coordinate-formatted graph,\n"
+        "    edges from STDIN (or from the optionally-specified file)\n"
+        "  --device=<device_index>   Set GPU device to run. [Default: 0]\n"
+        "  --undirected              Convert the graph to undirected\n"
+        "  --instrumented            Keep kernels statics [Default: Disable]\n"
+        "                            total_queued, search_depth and avg_duty\n"
+        "                            (a relative indicator of load imbalance)\n"
+        "  --quick                   Skip the CPU validation [Default: false]\n"
+        "  --queue-sizing=<factor>   Allocates a frontier queue sized at: \n"
+        "                            (graph-edges * <factor>) [Default: 1.0]\n"
+        "  --v                       Print verbose per iteration debug info\n"
+        "  --iteration-num=<number>  Number of tests to run [Default: 1]\n"
+        "  --traversal-mode=<0 | 1>  Set strategy, 0 for Load-Balanced,\n"
+        "                            1 for Dynamic-Cooperative\n"
+        "                            [Default: according to topology]\n");
+}
+
+/**
+ * @brief Displays primitive result (placeholder: the body is still a TODO)
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void DisplaySolution(const Csr<VertexId, Value, SizeT> &graph) {
+    // TODO: code to print out results (graph is currently unused)
+}
+
+
+/**
+ * @brief Performance / Evaluation statistics
+ */
+struct Stats {
+    const char *name;          // label printed by DisplayStats; not owned
+    Statistic num_iterations;  // default-constructed; never read in this file
+    Stats() : name(NULL), num_iterations() {}
+    explicit Stats(const char *name) : name(name), num_iterations() {}
+};
+
+/**
+ * @brief Displays timing statistics (name of the run and elapsed time)
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ * @param[in] stats Reference to the Stats object; only stats.name is printed
+ * @param[in] graph Reference to the CSR graph we process on (unused here)
+ * @param[in] elapsed Elapsed time in milliseconds, printed with 4 decimals
+ * @param[in] iterations Iteration count (accepted but not printed)
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void DisplayStats(const Stats &stats, const Csr<VertexId, Value, SizeT> &graph,
+                  const float elapsed, const long long iterations) {
+    printf("[%s] finished.\n", stats.name);
+    printf("elapsed: %.4f ms\n", elapsed);
+}
+
+// ----------------------------------------------------------------------------
+// Testing Routines
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief CPU reference placeholder: times an empty region and prints it.
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT    Graph size type
+ * @tparam Value    Value type
+ *
+ * @param[in] graph Reference to the CSR graph we process on (unused so far)
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void SimpleReference(const Csr<VertexId, Value, SizeT> &graph) {
+    // initialization (nothing yet)
+
+    // perform calculation (nothing yet)
+
+    CpuTimer cpu_timer;
+    cpu_timer.Start();
+
+    // TODO: CPU validation code here
+
+    cpu_timer.Stop();
+
+    float cpu_elapsed = cpu_timer.ElapsedMillis();  // time of the empty region
+    printf("CPU reference finished in %lf ms.\n\n", cpu_elapsed);
+}
+
+/**
+ * @brief Sample test
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT Graph size type
+ * @tparam Value Value type
+ * @tparam INSTRUMENT Forwarded as the template argument of VISEnactor
+ *
+ * @param[in] graph Reference to the CSR graph we process on
+ * @param[in] max_grid_size Maximum CTA occupancy
+ * @param[in] num_gpus Number of GPUs
+ * @param[in] max_queue_sizing Scaling factor used in edge mapping
+ * @param[in] iterations Number of iterations for running the test
+ * @param[in] traversal_mode Strategy: Load-balanced or Dynamic cooperative
+ * @param[in] context Reference to the CudaContext for ModernGPU APIs
+ */
+template<typename VertexId, typename SizeT, typename Value, bool INSTRUMENT>
+void RunTest(
+    const Csr<VertexId, Value, SizeT> &graph,
+    int          max_grid_size,
+    int          num_gpus,
+    double       max_queue_sizing,
+    int          iterations,
+    int          traversal_mode,
+    CudaContext& context) {
+    typedef VISProblem<VertexId, SizeT, Value> Problem;
+
+    // allocate host-side array (for both reference and GPU-computed results)
+    VertexId *r_labels = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
+    VertexId *h_labels = (VertexId*)malloc(sizeof(VertexId) * graph.nodes);
+
+    // allocate primitive enactor map
+    VISEnactor<INSTRUMENT> enactor(g_verbose);
+
+    // allocate primitive problem on GPU
+    Problem *csr_problem = new Problem;
+    util::GRError(csr_problem->Init(
+                      g_stream_from_host,
+                      graph,
+                      num_gpus),
+                  "Problem Initialization Failed", __FILE__, __LINE__);
+
+    Stats *stats = new Stats("Vertex-Induced Subgraph");
+
+    // perform calculation
+    GpuTimer gpu_timer;
+
+    float elapsed = 0.0f;
+
+    for (int iter = 0; iter < iterations; ++iter) {
+        util::GRError(
+            csr_problem->Reset(enactor.GetFrontierType(),
+                               max_queue_sizing),
+            "Problem Data Reset Failed", __FILE__, __LINE__);
+        gpu_timer.Start();
+        util::GRError(
+            enactor.template Enact<Problem>(context, csr_problem,
+                max_grid_size, traversal_mode),
+            "Problem Enact Failed", __FILE__, __LINE__);
+        gpu_timer.Stop();
+        elapsed += gpu_timer.ElapsedMillis();
+    }
+
+    if (iterations > 0) { elapsed /= iterations; }  // no runs: keep 0, not NaN
+
+    // extract results
+    util::GRError(csr_problem->Extract(h_labels),
+        "Problem Data Extraction Failed", __FILE__, __LINE__);
+
+    // compute reference CPU validation solution
+    if (!g_quick) {
+        printf("-- computing reference value ... (currently missing)\n");
+        SimpleReference<VertexId, SizeT, Value>(graph);
+        printf("-- validation: (currently missing)\n");
+    }
+
+    // display solution
+    DisplaySolution<VertexId, SizeT, Value>(graph);
+
+    // display statistics
+    VertexId num_iterations = 0;
+    enactor.GetStatistics(num_iterations);
+    DisplayStats<VertexId, SizeT, Value>(*stats, graph, elapsed, num_iterations);
+
+    // clean up
+    delete stats;
+    if (csr_problem) delete csr_problem;
+    if (r_labels)    free(r_labels);
+    if (h_labels)    free(h_labels);
+
+    cudaDeviceSynchronize();
+}
+
+/**
+ * @brief Test entry: reads the run options and dispatches on --instrumented
+ *
+ * @tparam VertexId Vertex identifier type
+ * @tparam SizeT Graph size type
+ * @tparam Value Value type
+ *
+ * @param[in] graph Reference to the CSR graph we process on
+ * @param[in] args Reference to the command line arguments
+ * @param[in] context Reference to the CudaContext for ModernGPU APIs
+ */
+template<typename VertexId, typename SizeT, typename Value>
+void RunTest(
+    Csr<VertexId, Value, SizeT> &graph,
+    CommandLineArgs &args,
+    CudaContext& context) {
+    bool   instrumented     =   0;  // Collect instrumentation from kernels
+    int    max_grid_size    =   0;  // Maximum grid size (0: up to the enactor)
+    int    num_gpus         =   1;  // Number of GPUs for multi-GPU enactor
+    double max_queue_sizing = 1.0;  // Maximum scaling factor for work queues
+    int    iterations       =   1;  // Number of runs for testing
+    int    traversal_mode   =  -1;  // Load-balanced or Dynamic cooperative
+    g_quick                 =   0;  // Whether or not to skip CPU validation
+
+    // choose traversal mode
+    args.GetCmdLineArgument("traversal-mode", traversal_mode);
+    if (traversal_mode == -1) {
+        traversal_mode = graph.GetAverageDegree() > 8 ? 0 : 1;
+    }
+
+    g_verbose    = args.CheckCmdLineFlag("v");
+    instrumented = args.CheckCmdLineFlag("instrumented");
+    g_quick = args.CheckCmdLineFlag("quick");
+
+    args.GetCmdLineArgument("iteration-num", iterations);
+    args.GetCmdLineArgument("grid-size", max_grid_size);
+    args.GetCmdLineArgument("queue-sizing", max_queue_sizing);
+
+    if (instrumented) {
+        RunTest<VertexId, SizeT, Value, true>(  // order matches the callee
+            graph,
+            max_grid_size,
+            num_gpus,
+            max_queue_sizing,
+            iterations,
+            traversal_mode,
+            context);
+    } else {
+        RunTest<VertexId, SizeT, Value, false>(  // order matches the callee
+            graph,
+            max_grid_size,
+            num_gpus,
+            max_queue_sizing,
+            iterations,
+            traversal_mode,
+            context);
+    }
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, char** argv) {  // returns 0 on success, 1 on any early exit
+    CommandLineArgs args(argc, argv);
+    if ((argc < 2) || (args.CheckCmdLineFlag("help"))) {
+        Usage();
+        return 1;  // no arguments at all, or --help requested
+    }
+
+    int device = 0;  // stays 0 unless --device is given
+    args.GetCmdLineArgument("device", device);
+    ContextPtr context = mgpu::CreateCudaDevice(device);
+
+    // parse graph-construction parameters
+    g_undirected = args.CheckCmdLineFlag("undirected");
+
+    std::string graph_type = argv[1];
+    int flags = args.ParsedArgc();
+    int graph_args = argc - flags - 1;  // presumably non-flag args; confirm ParsedArgc()
+    if (graph_args < 1) {
+        Usage();
+        return 1;
+    }
+
+    typedef int VertexId;  // Use as the vertex identifier
+    typedef int SizeT;     // Use as the graph size type
+    typedef int Value;     // Use as the value type
+
+    if (graph_type == "market") {
+        // matrix-market coordinate-formatted graph
+        Csr<VertexId, Value, SizeT> csr(false);
+        char *name = (graph_args == 2) ? argv[2] : NULL;  // NULL: no file given
+        if (graphio::BuildMarketGraph<false>(
+            name, csr, g_undirected, false) != 0) {
+            return 1;  // graph construction reported failure
+        }
+
+        csr.DisplayGraph();    // display graph adjacent list
+        csr.PrintHistogram();  // display graph histogram
+        RunTest(csr, args, *context);  // run sample test
+
+    } else {
+        fprintf(stderr, "Unspecified graph type\n");  // only "market" is handled
+        return 1;
+    }
+    return 0;
+}