From 35524b9c5b32ef9851c95060d0e2a52d43e16750 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Thu, 5 Feb 2026 23:27:36 +0800 Subject: [PATCH 1/5] prototype affine loop perfection pass --- include/TaskflowDialect/TaskflowPasses.h | 1 + include/TaskflowDialect/TaskflowPasses.td | 13 + .../AffineLoopPerfectionPass.cpp | 267 ++++++++++++++++++ .../Transforms/Optimizations/CMakeLists.txt | 1 + test/benchmark/CGRA-Bench | 2 +- 5 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h index c50544c9..71a3b510 100644 --- a/include/TaskflowDialect/TaskflowPasses.h +++ b/include/TaskflowDialect/TaskflowPasses.h @@ -23,6 +23,7 @@ std::unique_ptr createMapTaskOnCgraPass(); // Optimization Passes //=========================================================// std::unique_ptr createAffineLoopTreeSerializationPass(); +std::unique_ptr createAffineLoopPerfectionPass(); #define GEN_PASS_REGISTRATION #include "TaskflowDialect/TaskflowPasses.h.inc" diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td index d41ae666..4bb69caf 100644 --- a/include/TaskflowDialect/TaskflowPasses.td +++ b/include/TaskflowDialect/TaskflowPasses.td @@ -21,6 +21,19 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module "mlir::func::FuncDialect"]; } +def AffineLoopPerfection : Pass<"affine-loop-perfection", "func::FuncOp">{ + let summary = "Perfectionizes affine.for loops into perfect nested loop bands"; + let description = [{ + This pass converts imperfectly nested affine.for loop bands into perfectly + nested ones by sinking prologue and epilogue operations into the innermost + loop; side-effecting operations are guarded by loop-boundary conditions.
+ }]; + let constructor = "taskflow::createAffineLoopPerfectionPass()"; + let dependentDialects = [ + "mlir::affine::AffineDialect", + "mlir::func::FuncDialect"]; +} + //=========================================================// // Passes for the Taskflow dialect //=========================================================// diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp new file mode 100644 index 00000000..15195684 --- /dev/null +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp @@ -0,0 +1,267 @@ +#include "TaskflowDialect/TaskflowDialect.h" +#include "TaskflowDialect/TaskflowOps.h" +#include "TaskflowDialect/TaskflowPasses.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/TypeID.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::taskflow; + +namespace { +//================================================================= +// Affine Loop Band Structure. +//================================================================= + +// A loop band can be classified into two types: + +// 1) Perfect Loop Band: A sequence of perfectly nested loops where each loop +// (except the innermost) has exactly one child loop and no other operations (no +// prologue/epilogue); +// 2) Imperfect Loop Band: A sequence of nested loops that +// do not satisfy the perfect nesting condition (e.g., loops with exactly one +// child loop, but with other operations in the body). 
+using AffineLoopBand = SmallVector; + +// Returns true if an operation has side effects (false for pure computation). +static bool hasSideEffect(Operation *op) { + // Yield operations are terminators, not computations. + if (isa(op)) { + return true; + } + + // Arithmetic and pure operations. + if (isa(op->getDialect())) { + return false; + } + + // affine.load or memref.load is considered side-effect-free (read-only). + if (isa(op) || isa(op)) { + return false; + } + + // affine.store and memref.store are side-effecting (write operations). + if (isa(op)) { + return true; + } + + // For other operations, conservatively assumes they have side effects. + return true; +} + +// Collects loop bands from a function. +static void collectLoopBands(func::FuncOp func_op, + SmallVector &loop_bands) { + func_op.walk([&](affine::AffineForOp for_op) { + // Only processes outermost loops (skips nested loops). + if (for_op->getParentOfType()) { + return; + } + + AffineLoopBand current_band; + affine::AffineForOp current_loop = for_op; + + // Follows the nesting chain to build the perfect loop band. + while (current_loop) { + current_band.push_back(current_loop); + + // Checks if body has exactly one nested loop (perfect nesting). + Block &body = current_loop.getRegion().front(); + affine::AffineForOp nested_loop = nullptr; + size_t num_loops = 0; + size_t num_other_ops = 0; + + for (Operation &body_op : body) { + if (auto nested_for = dyn_cast(&body_op)) { + nested_loop = nested_for; + num_loops++; + } else if (!isa(&body_op)) { + // Counts other operations (excluding yield). + num_other_ops++; + } + } + + // Loop bands condition: exactly 1 nested loop, any number of other ops + // (other ops will be perfectized). + if (num_loops == 1) { + current_loop = nested_loop; + } else { + // Has multiple nested loops, not loop bands. 
+ break; + } + } + + if (!current_band.empty()) { + loop_bands.push_back(current_band); + } + }); +} + +//================================================================= +// Loop Perfection Logic. +//================================================================= + +// Applies loop perfection to a single loop band. +// Sinks all operations into the innermost loop with condition execution. +static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) { + if (loop_band.empty()) { + return failure(); + } + + llvm::errs() << "[LoopPerfection] Processing loop band with " + << loop_band.size() << " loops.\n"; + + affine::AffineForOp innermost_loop = loop_band.back(); + OpBuilder builder(innermost_loop); + + // Processes each parent loop in the band from innermost to outermost. + for (size_t i = loop_band.size() - 1; i > 0; i--) { + affine::AffineForOp loop = loop_band[i - 1]; + affine::AffineForOp child_loop = loop_band[i]; + + // Collects prologue and epilogue operations in the current loop (excluding + // the child loop). + SmallVector prologue_ops; // Before child loop. + SmallVector epilogue_ops; // After child loop. + + bool is_prologue = true; + for (Operation &op : loop.getRegion().front()) { + if (&op == child_loop) { + is_prologue = false; + continue; + } + + if (isa(&op)) { + // Skips yield operations. + continue; + } + + // Rejects operations that cannot be perfectized. + if (llvm::any_of(op.getResultTypes(), + [](Type type) { return isa(type); })) { + llvm::errs() + << "[LoopPerfection] Memref-producing op cannot be perfectized.\n"; + op.dump(); + return failure(); + } + + if (isa(&op)) { + llvm::errs() + << "[LoopPerfection] Function call op cannot be perfectized.\n"; + op.dump(); + return failure(); + } + + if (is_prologue) { + prologue_ops.push_back(&op); + } else { + epilogue_ops.push_back(&op); + } + } + + if (prologue_ops.empty() && epilogue_ops.empty()) { + // No operations to perfect, continues to next loop. 
+ continue; + } + + Location loc = loop.getLoc(); + Block &innermost_body = innermost_loop.getRegion().front(); + + // Gets all inner loops (from current child to innermost loop). + ArrayRef inner_loops = + ArrayRef(loop_band).drop_front(i); + + // Handles prologue operations. + if (!prologue_ops.empty()) { + llvm::errs() << " Moving " << prologue_ops.size() + << " prologue operations\n"; + + Operation *insert_point = &innermost_body.front(); + + // Separates pure and side-effecting operations in the prologue. + SmallVector pure_ops; + SmallVector side_effect_ops; + + for (Operation *op : prologue_ops) { + if (hasSideEffect(op)) { + side_effect_ops.push_back(op); + } else { + pure_ops.push_back(op); + } + } + + // Moves pure operations directly into the innermost loop (will be CSE'd + // if redundant). + for (Operation *op : pure_ops) { + op->moveBefore(insert_point); + } + + // Moves side-effecting operations into the innermost loop with condition + // execution. + if (!side_effect_ops.empty()) { + builder.setInsertionPoint(insert_point); + Value condition = createPrologueCondition(builder, loc, inner_loops); + } + } + } +} + +//================================================================= +// Pass Implementation. +//================================================================= +struct AffineLoopPerfectionPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AffineLoopPerfectionPass) + + StringRef getArgument() const final { return "affine-loop-perfection"; } + StringRef getDescription() const final { + return "Apply loop perfection for affine loops."; + } + + void getDependentDialects(DialectRegistry &registry) const override { + registry + .insert(); + } + + void runOnOperation() override { + func::FuncOp func_op = getOperation(); + // Collects all loop bands in the function. 
+ SmallVector loop_bands; + collectLoopBands(func_op, loop_bands); + + if (loop_bands.empty()) { + llvm::errs() << "[LoopPerfection] No loop bands found in function: " + << func_op.getName() << "\n"; + return; + } + + llvm::errs() << "[LoopPerfection] Found " << loop_bands.size() + << " loop bands in function: " << func_op.getName() << "\n"; + + // Apply loop perfection to each loop band. + for (AffineLoopBand &band : loop_bands) { + if (failed(applyLoopPerfection(band))) { + signalPassFailure(); + return; + } + } + } +}; +} // namespace + +std::unique_ptr mlir::taskflow::createAffineLoopPerfectionPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt index 3e1ce5cd..2200f5b1 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt +++ b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) add_mlir_conversion_library(MLIRTaskflowOptimization AffineLoopTreeSerializationPass.cpp + AffineLoopPerfectionPass.cpp DEPENDS MLIRTaskflowTransformsIncGen diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index 2b5e78b2..2beecc59 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed +Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0 From c45ce492c1a1a6fa147b0e071e4981110cba6020 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Feb 2026 12:46:49 +0800 Subject: [PATCH 2/5] enable affine loop perfection optimization --- .../AffineLoopPerfectionPass.cpp | 157 ++++++++++++++++-- 1 file changed, 144 insertions(+), 13 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp index 15195684..34f82c36 100644 --- 
a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp @@ -1,5 +1,3 @@ -#include "TaskflowDialect/TaskflowDialect.h" -#include "TaskflowDialect/TaskflowOps.h" #include "TaskflowDialect/TaskflowPasses.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" @@ -11,7 +9,6 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/TypeID.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" @@ -80,15 +77,11 @@ static void collectLoopBands(func::FuncOp func_op, Block &body = current_loop.getRegion().front(); affine::AffineForOp nested_loop = nullptr; size_t num_loops = 0; - size_t num_other_ops = 0; for (Operation &body_op : body) { if (auto nested_for = dyn_cast(&body_op)) { nested_loop = nested_for; num_loops++; - } else if (!isa(&body_op)) { - // Counts other operations (excluding yield). - num_other_ops++; } } @@ -112,6 +105,75 @@ static void collectLoopBands(func::FuncOp func_op, // Loop Perfection Logic. //================================================================= +// Creates a condition checking if all inner loop indices are at their lower +// bounds. Used for prologue condition. +static Value +createPrologueCondition(OpBuilder &builder, Location loc, + ArrayRef inner_loops) { + // Builds condition for prologue code: (i1 == lb1) && (i2 == lb2) && ... 
+ Value condition = nullptr; + + for (affine::AffineForOp loop : inner_loops) { + Value idx = loop.getInductionVar(); + Value lb; + + if (loop.hasConstantLowerBound()) { + lb = builder.create(loc, + loop.getConstantLowerBound()); + } else { + llvm::errs() + << "[LoopPerfection] Non-constant lower bound not supported.\n"; + return nullptr; + } + + Value eq = + builder.create(loc, arith::CmpIPredicate::eq, idx, lb); + + if (condition) { + condition = builder.create(loc, condition, eq); + } else { + condition = eq; + } + } + + return condition; +} + +// Creates a condition checking if all inner loop indices are at their upper +// bounds. Used for epilogue condition. +static Value +createEpilogueCondition(OpBuilder &builder, Location loc, + ArrayRef inner_loops) { + // Builds condition for epilogue code: (i1 == ub1 - 1) && (i2 == ub2 - 1) && + // ... + Value condition = nullptr; + + for (affine::AffineForOp loop : inner_loops) { + Value idx = loop.getInductionVar(); + Value ub_minus_1; + + if (loop.hasConstantUpperBound()) { + ub_minus_1 = builder.create( + loc, loop.getConstantUpperBound() - 1); + } else { + llvm::errs() + << "[LoopPerfection] Non-constant upper bound not supported.\n"; + return nullptr; + } + + Value eq = builder.create(loc, arith::CmpIPredicate::eq, idx, + ub_minus_1); + + if (condition) { + condition = builder.create(loc, condition, eq); + } else { + condition = eq; + } + } + + return condition; +} + // Applies loop perfection to a single loop band. // Sinks all operations into the innermost loop with condition execution. static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) { @@ -130,8 +192,8 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) { affine::AffineForOp loop = loop_band[i - 1]; affine::AffineForOp child_loop = loop_band[i]; - // Collects prologue and epilogue operations in the current loop (excluding - // the child loop). 
+ // Collects prologue and epilogue operations in the current loop + // (excluding the child loop). SmallVector prologue_ops; // Before child loop. SmallVector epilogue_ops; // After child loop. @@ -150,8 +212,8 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) { // Rejects operations that cannot be perfectized. if (llvm::any_of(op.getResultTypes(), [](Type type) { return isa(type); })) { - llvm::errs() - << "[LoopPerfection] Memref-producing op cannot be perfectized.\n"; + llvm::errs() << "[LoopPerfection] Memref-producing op cannot be " + "perfectized.\n"; op.dump(); return failure(); } @@ -207,14 +269,83 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) { op->moveBefore(insert_point); } - // Moves side-effecting operations into the innermost loop with condition - // execution. + // Moves side-effecting operations into the innermost loop with + // condition execution. if (!side_effect_ops.empty()) { builder.setInsertionPoint(insert_point); Value condition = createPrologueCondition(builder, loc, inner_loops); + + if (condition) { + scf::IfOp if_op = builder.create(loc, condition, + /*withElseRegion*/ false); + + Block *then_block = if_op.thenBlock(); + + for (Operation *op : side_effect_ops) { + op->moveBefore(then_block->getTerminator()); + } + } else { + // If condition creation fails, returns failure to avoid + // incorrect transformation. + llvm::errs() + << "[LoopPerfection] Failed to create prologue condition.\n"; + return failure(); + } + } + } + + // Handles epilogue operations. + if (!epilogue_ops.empty()) { + llvm::errs() << " Moving " << epilogue_ops.size() + << " epilogue operations\n"; + + Operation *insert_point = innermost_body.getTerminator(); + + // Separates pure and side-effecting operations in the epilogue. 
+ SmallVector pure_ops; + SmallVector side_effect_ops; + + for (Operation *op : epilogue_ops) { + if (hasSideEffect(op)) { + side_effect_ops.push_back(op); + } else { + pure_ops.push_back(op); + } + } + + // Moves pure operations directly into the innermost loop (will be CSE'd + // if redundant). + for (Operation *op : pure_ops) { + op->moveBefore(insert_point); + } + + // Moves side-effecting operations into the innermost loop with + // condition execution. + if (!side_effect_ops.empty()) { + builder.setInsertionPoint(insert_point); + Value condition = createEpilogueCondition(builder, loc, inner_loops); + + if (condition) { + scf::IfOp if_op = builder.create(loc, condition, + /*withElseRegion*/ false); + + Block *then_block = if_op.thenBlock(); + + for (Operation *op : side_effect_ops) { + op->moveBefore(then_block->getTerminator()); + } + } else { + // If condition creation fails, returns failure to avoid + // incorrect transformation. + llvm::errs() + << "[LoopPerfection] Failed to create epilogue condition.\n"; + return failure(); + } } } } + + return success(); } //================================================================= From 61922156b45f858043460eb689d880ae2864b523 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Feb 2026 14:36:38 +0800 Subject: [PATCH 3/5] unstage submodule changes --- .../AffineLoopPerfectionPass.cpp | 28 +++++++--- .../irregular-loop/irregular-loop.mlir | 44 +++++++++++++++ .../taskflow/multi-nested/multi-nested.mlir | 56 +++++++++++++++++++ 3 files changed, 121 insertions(+), 7 deletions(-) diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp index 34f82c36..f22c1a66 100644 --- a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp +++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp @@ -150,24 +150,38 @@ createEpilogueCondition(OpBuilder &builder, Location loc, for 
(affine::AffineForOp loop : inner_loops) { Value idx = loop.getInductionVar(); - Value ub_minus_1; + Value next_idx; // idx + step + Value ub; + + // Gets step. + int32_t step_val = 1; + if (loop.getStepAsInt()) { + step_val = loop.getStepAsInt(); + } else { + llvm::errs() << "[LoopPerfection] Non-constant step not supported.\n"; + return nullptr; + } + + // Computes next_idx = idx + step. + Value step = builder.create(loc, step_val); + next_idx = builder.create(loc, idx, step); if (loop.hasConstantUpperBound()) { - ub_minus_1 = builder.create( - loc, loop.getConstantUpperBound() - 1); + ub = builder.create(loc, + loop.getConstantUpperBound()); } else { llvm::errs() << "[LoopPerfection] Non-constant upper bound not supported.\n"; return nullptr; } - Value eq = builder.create(loc, arith::CmpIPredicate::eq, idx, - ub_minus_1); + Value is_last = builder.create( + loc, arith::CmpIPredicate::sge, next_idx, ub); if (condition) { - condition = builder.create(loc, condition, eq); + condition = builder.create(loc, condition, is_last); } else { - condition = eq; + condition = is_last; } } diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir index 19fa277b..2a4eb496 100644 --- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir +++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir @@ -2,6 +2,11 @@ // RUN: -o %t.serialized.mlir // RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: -o %t.perfect.mlir +// RUN: FileCheck %s --input-file=%t.perfect.mlir --check-prefixes=PERFECT + // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir @@ -103,6 +108,45 @@ module attributes {} { // SERIALIZED-NEXT: } // SERIALIZED-NEXT: } +// PERFECT: module { +// PERFECT-NEXT: func.func 
@_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { +// PERFECT-NEXT: %c2_i32 = arith.constant 2 : i32 +// PERFECT-NEXT: %c8_i32 = arith.constant 8 : i32 +// PERFECT-NEXT: %c0_i32 = arith.constant 0 : i32 +// PERFECT-NEXT: %alloca = memref.alloca() : memref +// PERFECT-NEXT: %alloca_0 = memref.alloca() : memref<4x8xi32> +// PERFECT-NEXT: %0 = affine.for %arg0 = 0 to 5 iter_args(%arg1 = %c0_i32) -> (i32) { +// PERFECT-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// PERFECT-NEXT: %3 = arith.addi %arg1, %2 : i32 +// PERFECT-NEXT: affine.yield %3 : i32 +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg0 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg1 = 0 to 8 { +// PERFECT-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// PERFECT-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// PERFECT-NEXT: %4 = arith.index_cast %arg1 : index to i32 +// PERFECT-NEXT: %5 = arith.addi %3, %4 : i32 +// PERFECT-NEXT: affine.store %5, %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg0 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg1 = 0 to 8 { +// PERFECT-NEXT: %2 = arith.index_cast %arg0 : index to i32 +// PERFECT-NEXT: %3 = arith.muli %2, %c8_i32 : i32 +// PERFECT-NEXT: %4 = affine.load %alloca_0[%arg0, %arg1] : memref<4x8xi32> +// PERFECT-NEXT: %5 = arith.addi %4, %0 : i32 +// PERFECT-NEXT: affine.if #set(%arg0, %arg1) { +// PERFECT-NEXT: affine.store %5, %alloca[] : memref +// PERFECT-NEXT: %6 = arith.muli %5, %c2_i32 : i32 +// PERFECT-NEXT: affine.store %6, %alloca[] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: %1 = affine.load %alloca[] : memref +// PERFECT-NEXT: return %1 : i32 +// PERFECT-NEXT: } +// PERFECT-NEXT: } + // TASKFLOW: #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)> // TASKFLOW-NEXT: module { // TASKFLOW-NEXT: func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage} { diff --git 
a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir index ebdbe079..e6376f44 100644 --- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir +++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir @@ -2,6 +2,11 @@ // RUN: -o %t.serialized.mlir // RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED +// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ +// RUN: --affine-loop-perfection \ +// RUN: -o %t.perfect.mlir +// RUN: FileCheck %s --input-file=%t.perfect.mlir --check-prefixes=PERFECT + // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \ // RUN: --convert-affine-to-taskflow \ // RUN: -o %t.taskflow.mlir @@ -118,6 +123,57 @@ module attributes {} { // SERIALIZED-NEXT: } // SERIALIZED-NEXT: } +// PERFECT: module { +// PERFECT-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { +// PERFECT-NEXT: affine.for %arg10 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg11 = 0 to 8 { +// PERFECT-NEXT: affine.for %arg12 = 0 to 6 { +// PERFECT-NEXT: %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref +// PERFECT-NEXT: affine.store %1, %arg5[%arg12] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg10 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg11 = 0 to 8 { +// PERFECT-NEXT: affine.for %arg12 = 0 to 5 { +// PERFECT-NEXT: %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref +// PERFECT-NEXT: %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref +// PERFECT-NEXT: %3 = arith.addi %1, %2 : i32 +// PERFECT-NEXT: affine.store %3, %arg6[%arg12] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg10 = 0 to 4 { +// PERFECT-NEXT: 
affine.for %arg11 = 0 to 8 { +// PERFECT-NEXT: affine.for %arg12 = 0 to 6 { +// PERFECT-NEXT: %1 = affine.load %arg5[%arg12] : memref +// PERFECT-NEXT: %2 = affine.load %arg6[%arg12] : memref +// PERFECT-NEXT: %3 = arith.addi %1, %2 : i32 +// PERFECT-NEXT: %4 = affine.load %arg9[0] : memref +// PERFECT-NEXT: %5 = arith.addi %4, %3 : i32 +// PERFECT-NEXT: affine.store %5, %arg9[0] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg10 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg11 = 0 to 7 { +// PERFECT-NEXT: %1 = affine.load %arg3[%arg10, %arg11] : memref +// PERFECT-NEXT: affine.store %1, %arg7[%arg11] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: affine.for %arg10 = 0 to 4 { +// PERFECT-NEXT: affine.for %arg11 = 0 to 9 { +// PERFECT-NEXT: %1 = affine.load %arg4[%arg10, %arg11] : memref +// PERFECT-NEXT: %2 = affine.load %arg7[%arg11] : memref +// PERFECT-NEXT: %3 = arith.addi %1, %2 : i32 +// PERFECT-NEXT: affine.store %3, %arg8[%arg11] : memref +// PERFECT-NEXT: } +// PERFECT-NEXT: } +// PERFECT-NEXT: %0 = affine.load %arg9[0] : memref +// PERFECT-NEXT: return %0 : i32 +// PERFECT-NEXT: } +// PERFECT-NEXT: } + // TASKFLOW: module { // TASKFLOW-NEXT: func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref, %arg1: memref, %arg2: memref, %arg3: memref, %arg4: memref, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref) -> i32 attributes {llvm.linkage = #llvm.linkage} { // TASKFLOW-NEXT: %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref) write_memrefs(%arg5 : memref) [original_read_memrefs(%arg0 : memref), original_write_memrefs(%arg5 : memref)] : (memref, memref) -> (memref) { From f9ff3d112415c79c7be74a99aa71a6fdb7ff9aff Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Feb 2026 15:17:11 +0800 Subject: [PATCH 4/5] sync cgrabench --- test/benchmark/CGRA-Bench | 2 +- test/benchmark/Zeonica_Testbench | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index 2beecc59..f130a221 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0 +Subproject commit f130a22161890f9f52e16651b2dd28008e4ff03f diff --git a/test/benchmark/Zeonica_Testbench b/test/benchmark/Zeonica_Testbench index 45e85e44..c7590d83 160000 --- a/test/benchmark/Zeonica_Testbench +++ b/test/benchmark/Zeonica_Testbench @@ -1 +1 @@ -Subproject commit 45e85e44d58670e08a88dfcebfd471909699ae2c +Subproject commit c7590d836df404dca078c4c5104c39673100a4af From fac94aedf28c879c668509c6b2949d29002084e3 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 6 Feb 2026 15:22:17 +0800 Subject: [PATCH 5/5] update cgrabench --- test/benchmark/CGRA-Bench | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench index f130a221..2b5e78b2 160000 --- a/test/benchmark/CGRA-Bench +++ b/test/benchmark/CGRA-Bench @@ -1 +1 @@ -Subproject commit f130a22161890f9f52e16651b2dd28008e4ff03f +Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed