flexflow · elliottslaughter · Apr 9, 2026 · Apr 9, 2026 · Apr 15, 2026 · Apr 15, 2026
diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.dtg.toml b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.dtg.toml
@@ -11,13 +11,13 @@ features = [
 ]
 
 includes = [
-  "op-attrs/ops/attention_attrs.dtg.h", 
-  "op-attrs/ops/batch_matmul_attrs.dtg.h", 
-  "op-attrs/ops/batch_norm_attrs.dtg.h", 
-  "op-attrs/ops/broadcast_attrs.dtg.h", 
-  "op-attrs/ops/cast_attrs.dtg.h", 
-  "op-attrs/ops/combine_attrs.dtg.h", 
-  "op-attrs/ops/concat_attrs.dtg.h", 
+  "op-attrs/ops/attention_attrs.dtg.h",
+  "op-attrs/ops/batch_matmul_attrs.dtg.h",
+  "op-attrs/ops/batch_norm_attrs.dtg.h",
+  "op-attrs/ops/broadcast_attrs.dtg.h",
+  "op-attrs/ops/cast_attrs.dtg.h",
+  "op-attrs/ops/combine_attrs.dtg.h",
+  "op-attrs/ops/concat_attrs.dtg.h",
   "op-attrs/ops/conv_2d_attrs.dtg.h",
   "op-attrs/ops/dropout_attrs.dtg.h",
   "op-attrs/ops/element_binary_attrs.dtg.h",
@@ -61,7 +61,7 @@ key = "cast"
 
 [[values]]
 type = "::FlexFlow::CombineAttrs"
-key = "combine_distributed"
+key = "parallel_combine"
 
 [[values]]
 type = "::FlexFlow::ConcatAttrs"
@@ -125,15 +125,15 @@ key = "reduce"
 
 [[values]]
 type = "::FlexFlow::ReductionAttrs"
-key = "reduce_distributed"
+key = "parallel_reduce"
 
 [[values]]
 type = "::FlexFlow::RepartitionAttrs"
-key = "partition_distributed"
+key = "parallel_partition"
 
 [[values]]
 type = "::FlexFlow::ReplicateAttrs"
-key = "replicate_distributed"
+key = "parallel_replicate"
 
 [[values]]
 type = "::FlexFlow::ReverseAttrs"

diff --git a/lib/op-attrs/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/src/op-attrs/ops/element_unary.cc
@@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees(
     ElementUnaryAttrs const &attrs,
     ParallelTensorDimDegrees const &input_degrees) {
   ASSERT(input_degrees.sum_degree.value == 1);
-  ASSERT(input_degrees.discard_copy_degree.value == 1);
 
   return input_degrees;
 }

diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc
@@ -53,22 +53,26 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(result == correct);
     }
 
-    SUBCASE("sum degree > 1") {
+    SUBCASE("discard copy degree > 1") {
       positive_int degree = 2_p;
 
-      CHECK_THROWS(get_output_shape(
-          attrs,
-          make_input(
-              SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
+      ParallelTensorShape par_input =
+          make_input(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p);
+
+      tl::expected<ParallelTensorShape, std::string> result =
+          get_output_shape(attrs, par_input);
+      tl::expected<ParallelTensorShape, std::string> correct = par_input;
+
+      CHECK(result == correct);
     }
 
-    SUBCASE("discard copy degree > 1") {
+    SUBCASE("sum degree > 1") {
       positive_int degree = 2_p;
 
       CHECK_THROWS(get_output_shape(
           attrs,
           make_input(
-              SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)));
+              SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
     }
   }
 }
diff --git a/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.h b/lib/pcg/include/pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.h
@@ -8,15 +8,47 @@ namespace FlexFlow {
 
 std::unordered_set<parallel_layer_guid_t>
     mpcg_get_parallel_layers(MappedParallelComputationGraph const &);
+
 MappedOperatorTaskGroup
     mpcg_get_mapping_for_layer(MappedParallelComputationGraph const &,
                                parallel_layer_guid_t);
 
 ParallelComputationGraph pcg_from_mpcg(MappedParallelComputationGraph const &);
 
+parallel_layer_guid_t
+    mpcg_get_source_layer(MappedParallelComputationGraph const &,
+                          parallel_tensor_guid_t const &);
+
+PCGOperatorAttrs mpcg_get_pcg_op_attrs(MappedParallelComputationGraph const &,
+                                       parallel_layer_guid_t const &);
+
+ParallelTensorAttrs
+    mpcg_get_parallel_tensor_attrs(MappedParallelComputationGraph const &,
+                                   parallel_tensor_guid_t const &);
+
+std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
+    mpcg_get_incoming_edges(MappedParallelComputationGraph const &,
+                            parallel_layer_guid_t const &);
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_outgoing_edges(MappedParallelComputationGraph const &,
+                            parallel_layer_guid_t const &);
+
+ManyToOne<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_incoming_tensors(MappedParallelComputationGraph const &,
+                              parallel_layer_guid_t const &);
+
+bidict<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_outgoing_tensors(MappedParallelComputationGraph const &,
+                              parallel_layer_guid_t const &);
+
 std::unordered_set<ParallelComputationGraphEdge>
     mpcg_get_edges(MappedParallelComputationGraph const &);
 
+std::unordered_set<parallel_tensor_use_t>
+    mpcg_get_parallel_tensor_uses(MappedParallelComputationGraph const &,
+                                  parallel_tensor_guid_t const &);
+
 MappedParallelComputationGraph mapped_pcg_from_pcg_and_mapped_op_task_groups(
     ParallelComputationGraph const &pcg,
     std::unordered_map<parallel_layer_guid_t, MappedOperatorTaskGroup> const

diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h
@@ -10,6 +10,7 @@
 #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
 #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.dtg.h"
 #include <unordered_set>
 
 namespace FlexFlow {
@@ -53,6 +54,10 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
     get_incoming_edges(ParallelComputationGraph const &,
                        parallel_layer_guid_t const &);
 
+std::unordered_set<parallel_tensor_use_t>
+    pcg_get_parallel_tensor_uses(ParallelComputationGraph const &,
+                                 parallel_tensor_guid_t const &);
+
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &);
 

diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_use_t.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_tensor_use_t.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_PARALLEL_TENSOR_USE_T_H
+#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_PARALLEL_COMPUTATION_GRAPH_PARALLEL_TENSOR_USE_T_H
+
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.dtg.h"
+
+namespace FlexFlow {
+
+parallel_layer_guid_t
+    parallel_tensor_use_get_layer(parallel_tensor_use_t const &);
+TensorSlotName parallel_tensor_use_get_slot(parallel_tensor_use_t const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.cc b/lib/pcg/src/pcg/mapped_parallel_computation_graph/mapped_parallel_computation_graph.cc
@@ -2,12 +2,14 @@
 #include "op-attrs/pcg_operator_attrs.h"
 #include "pcg/mapped_parallel_computation_graph/mapped_parallel_layer_attrs.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/bidict/algorithms/bidict_from_map.h"
 #include "utils/bidict/algorithms/transform_keys.h"
 #include "utils/containers/transform.h"
 #include "utils/graph/kwarg_dataflow_graph/algorithms/find_isomorphism_between_kwarg_dataflow_graphs.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/labelled_kwarg_dataflow_graph_view_as_dot.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/materialize_labelled_kwarg_dataflow_graph_view.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/rewrite_labelled_kwarg_dataflow_graph_node_labels.h"
+#include "utils/many_to_one/many_to_one_from_map.h"
 
 namespace FlexFlow {
 
@@ -46,6 +48,59 @@ ParallelComputationGraph
   };
 }
 
+parallel_layer_guid_t
+    mpcg_get_source_layer(MappedParallelComputationGraph const &mpcg,
+                          parallel_tensor_guid_t const &t) {
+  return get_source_layer(pcg_from_mpcg(mpcg), t);
+}
+
+PCGOperatorAttrs
+    mpcg_get_pcg_op_attrs(MappedParallelComputationGraph const &mpcg,
+                          parallel_layer_guid_t const &l) {
+  return pcg_get_op_attrs(pcg_from_mpcg(mpcg), l);
+}
+
+ParallelTensorAttrs
+    mpcg_get_parallel_tensor_attrs(MappedParallelComputationGraph const &mpcg,
+                                   parallel_tensor_guid_t const &t) {
+  return get_parallel_tensor_attrs(pcg_from_mpcg(mpcg), t);
+}
+
+std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
+    mpcg_get_incoming_edges(MappedParallelComputationGraph const &mpcg,
+                            parallel_layer_guid_t const &l) {
+  return get_incoming_edges(pcg_from_mpcg(mpcg), l);
+}
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_outgoing_edges(MappedParallelComputationGraph const &mpcg,
+                            parallel_layer_guid_t const &l) {
+  return get_outgoing_edges(pcg_from_mpcg(mpcg), l);
+}
+
+ManyToOne<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_incoming_tensors(MappedParallelComputationGraph const &mpcg,
+                              parallel_layer_guid_t const &l) {
+  return many_to_one_from_map(get_incoming_tensors(pcg_from_mpcg(mpcg), l));
+}
+
+bidict<TensorSlotName, parallel_tensor_guid_t>
+    mpcg_get_outgoing_tensors(MappedParallelComputationGraph const &mpcg,
+                              parallel_layer_guid_t const &l) {
+  return bidict_from_map(get_outgoing_tensors(pcg_from_mpcg(mpcg), l));
+}
+
+std::unordered_set<ParallelComputationGraphEdge>
+    mpcg_get_edges(MappedParallelComputationGraph const &mpcg) {
+  return get_edges(pcg_from_mpcg(mpcg));
+}
+
+std::unordered_set<parallel_tensor_use_t>
+    mpcg_get_parallel_tensor_uses(MappedParallelComputationGraph const &mpcg,
+                                  parallel_tensor_guid_t const &t) {
+  return pcg_get_parallel_tensor_uses(pcg_from_mpcg(mpcg), t);
+}
+
 MappedParallelComputationGraph mapped_pcg_from_pcg_and_mapped_op_task_groups(
     ParallelComputationGraph const &pcg,
     std::unordered_map<parallel_layer_guid_t, MappedOperatorTaskGroup> const

diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc
@@ -28,6 +28,7 @@
 #include "utils/graph/kwarg_dataflow_graph/algorithms/find_isomorphism_between_kwarg_dataflow_graphs.h"
 #include "utils/graph/kwarg_dataflow_graph/algorithms/get_incoming_kwarg_dataflow_outputs_for_node.h"
 #include "utils/graph/kwarg_dataflow_graph/algorithms/get_kwarg_dataflow_edges_from_node_to_node.h"
+#include "utils/graph/kwarg_dataflow_graph/algorithms/get_kwarg_dataflow_value_uses.h"
 #include "utils/graph/kwarg_dataflow_graph/algorithms/get_outgoing_kwarg_dataflow_edges_for_node.h"
 #include "utils/graph/kwarg_dataflow_graph/algorithms/get_outgoing_kwarg_dataflow_outputs_for_node.h"
 #include "utils/graph/labelled_kwarg_dataflow_graph/algorithms/labelled_kwarg_dataflow_graph_view_as_dot.h"
@@ -206,6 +207,17 @@ std::unordered_map<TensorSlotName, ParallelComputationGraphEdge>
   });
 }
 
+std::unordered_set<parallel_tensor_use_t>
+    pcg_get_parallel_tensor_uses(ParallelComputationGraph const &pcg,
+                                 parallel_tensor_guid_t const &t) {
+  std::unordered_set<KwargDataflowInput<TensorSlotName>> raw_uses =
+      get_kwarg_dataflow_value_uses(pcg.raw_graph, t.raw_graph_output);
+
+  return transform(raw_uses, [](KwargDataflowInput<TensorSlotName> const &i) {
+    return parallel_tensor_use_t{i};
+  });
+}
+
 std::unordered_set<parallel_layer_guid_t>
     get_initial_layers(ParallelComputationGraph const &pcg) {
   std::unordered_set<Node> raw_sources = get_initial_nodes(pcg.raw_graph);

diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_tensor_use_t.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_tensor_use_t.cc
@@ -0,0 +1,14 @@
+#include "pcg/parallel_computation_graph/parallel_tensor_use_t.h"
+
+namespace FlexFlow {
+
+parallel_layer_guid_t
+    parallel_tensor_use_get_layer(parallel_tensor_use_t const &u) {
+  return parallel_layer_guid_t{u.raw_dataflow_input.node};
+}
+
+TensorSlotName parallel_tensor_use_get_slot(parallel_tensor_use_t const &u) {
+  return u.raw_dataflow_input.slot_name;
+}
+
+} // namespace FlexFlow
diff --git a/lib/realm-execution/include/realm-execution/realm_context.h b/lib/realm-execution/include/realm-execution/realm_context.h
@@ -9,6 +9,7 @@
 #include "pcg/device_id_t.dtg.h"
 #include "pcg/machine_space_coordinate.dtg.h"
 #include "realm-execution/realm.h"
+#include "realm-execution/redops/redop_id_t.dtg.h"
 #include "realm-execution/tasks/task_id_t.dtg.h"
 #include <optional>
 #include <unordered_map>
@@ -63,7 +64,7 @@ struct RealmContext {
                             int priority = 0);
   ///\}
 
-  /** \name Data movement */
+  /** \name Data movement and reduction */
   ///\{
   Realm::Event issue_copy(ParallelTensorShape const &src_shape,
                           Realm::RegionInstance src_inst,
@@ -72,6 +73,17 @@ struct RealmContext {
                           Realm::ProfilingRequestSet const &requests,
                           Realm::Event wait_on = Realm::Event::NO_EVENT,
                           int priority = 0);
+
+  Realm::Event issue_reduction(ParallelTensorShape const &src_shape,
+                               Realm::RegionInstance src_inst,
+                               ParallelTensorShape const &dst_shape,
+                               Realm::RegionInstance dst_inst,
+                               redop_id_t redop_id,
+                               bool is_fold,
+                               bool exclusive,
+                               Realm::ProfilingRequestSet const &requests,
+                               Realm::Event wait_on = Realm::Event::NO_EVENT,
+                               int priority = 0);
   ///\}
 
   /** \name Instance management */

diff --git a/lib/realm-execution/include/realm-execution/redops/realm_redop_registry.h b/lib/realm-execution/include/realm-execution/redops/realm_redop_registry.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REDOPS_REALM_REDOP_REGISTRY_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REDOPS_REALM_REDOP_REGISTRY_H
+
+#include "realm-execution/realm.h"
+#include "realm-execution/redops/redop_id_t.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * \brief Registers all known reduction operators (redops).
+ */
+void register_all_redops(Realm::Runtime);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/realm-execution/include/realm-execution/redops/redop_id_t.dtg.toml b/lib/realm-execution/include/realm-execution/redops/redop_id_t.dtg.toml
@@ -0,0 +1,27 @@
+namespace = "FlexFlow"
+name = "redop_id_t"
+type = "enum"
+features = [
+  "hash",
+  "fmt",
+  "rapidcheck",
+  "json",
+]
+docstring = '''
+\brief An enum for identifying reduction operators (redops) for use in the Realm runtime.
+'''
+
+[[values]]
+name = "SUM_BOOL_REDOP_ID"
+
+[[values]]
+name = "SUM_INT32_REDOP_ID"
+
+[[values]]
+name = "SUM_INT64_REDOP_ID"
+
+[[values]]
+name = "SUM_FLOAT_REDOP_ID"
+
+[[values]]
+name = "SUM_DOUBLE_REDOP_ID"
diff --git a/lib/realm-execution/include/realm-execution/redops/redop_id_t.h b/lib/realm-execution/include/realm-execution/redops/redop_id_t.h
@@ -0,0 +1,22 @@
+#ifndef _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REDOPS_REALM_REDOP_ID_T_H
+#define _FLEXFLOW_LIB_REALM_EXECUTION_INCLUDE_REALM_EXECUTION_REDOPS_REALM_REDOP_ID_T_H
+
+#include "op-attrs/datatype.dtg.h"
+#include "realm-execution/realm.h"
+#include "realm-execution/redops/redop_id_t.dtg.h"
+
+namespace FlexFlow {
+
+/**
+ * \brief Return the sum reduction operator (redop) ID for a given data type.
+ */
+redop_id_t get_sum_redop_id_for_data_type(DataType);
+
+/**
+   * \brief Convert a \ref FlexFlow::redop_id_t into a Realm reduction op ID.
+   */
+Realm::ReductionOpID get_realm_reduction_op_id_for_redop_id(redop_id_t);
+
+} // namespace FlexFlow
+
+#endif