From 35524b9c5b32ef9851c95060d0e2a52d43e16750 Mon Sep 17 00:00:00 2001
From: ShangkunLI <shangkun.li@connect.ust.hk>
Date: Thu, 5 Feb 2026 23:27:36 +0800
Subject: [PATCH 1/5] prototype affine loop perfection pass

---
 include/TaskflowDialect/TaskflowPasses.h      |   1 +
 include/TaskflowDialect/TaskflowPasses.td     |  13 +
 .../AffineLoopPerfectionPass.cpp              | 267 ++++++++++++++++++
 .../Transforms/Optimizations/CMakeLists.txt   |   1 +
 test/benchmark/CGRA-Bench                     |   2 +-
 5 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp

diff --git a/include/TaskflowDialect/TaskflowPasses.h b/include/TaskflowDialect/TaskflowPasses.h
index c50544c9..71a3b510 100644
--- a/include/TaskflowDialect/TaskflowPasses.h
+++ b/include/TaskflowDialect/TaskflowPasses.h
@@ -23,6 +23,7 @@ std::unique_ptr<mlir::Pass> createMapTaskOnCgraPass();
 // Optimization Passes
 //=========================================================//
 std::unique_ptr<mlir::Pass> createAffineLoopTreeSerializationPass();
+std::unique_ptr<mlir::Pass> createAffineLoopPerfectionPass();
 
 #define GEN_PASS_REGISTRATION
 #include "TaskflowDialect/TaskflowPasses.h.inc"
diff --git a/include/TaskflowDialect/TaskflowPasses.td b/include/TaskflowDialect/TaskflowPasses.td
index d41ae666..4bb69caf 100644
--- a/include/TaskflowDialect/TaskflowPasses.td
+++ b/include/TaskflowDialect/TaskflowPasses.td
@@ -21,6 +21,19 @@ def AffineLoopTreeSerialization : Pass<"affine-loop-tree-serialization", "Module
     "mlir::func::FuncDialect"];
 }
 
+def AffineLoopPerfection : Pass<"affine-loop-perfection", "func::FuncOp">{
+  let summary = "Perfectionizes affine.for loops into perfect nested loop bands";
+  let description = [{
+    This pass transforms imperfectly nested affine.for loops into perfect
+    nested loop bands by sinking the prologue/epilogue operations of each
+    outer loop into the innermost loop, guarding side effects with conditions.
+  }];
+  let constructor = "taskflow::createAffineLoopPerfectionPass()";
+  let dependentDialects = [
+    "mlir::affine::AffineDialect",
+    "mlir::func::FuncDialect"];
+}
+
 //=========================================================//
 // Passes for the Taskflow dialect
 //=========================================================//
diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
new file mode 100644
index 00000000..15195684
--- /dev/null
+++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
@@ -0,0 +1,267 @@
+#include "TaskflowDialect/TaskflowDialect.h"
+#include "TaskflowDialect/TaskflowOps.h"
+#include "TaskflowDialect/TaskflowPasses.h"
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/TypeID.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::taskflow;
+
+namespace {
+//=================================================================
+// Affine Loop Band Structure.
+//=================================================================
+
+// A loop band can be classified into two types:
+
+// 1) Perfect Loop Band: A sequence of perfectly nested loops where each loop
+// (except the innermost) has exactly one child loop and no other operations (no
+// prologue/epilogue);
+// 2) Imperfect Loop Band: A sequence of nested loops that
+// do not satisfy the perfect nesting condition (e.g., loops with exactly one
+// child loop, but with other operations in the body).
+using AffineLoopBand = SmallVector<affine::AffineForOp>;
+
+// Checks if an operation has side effects (i.e., is not a pure computation).
+static bool hasSideEffect(Operation *op) {
+  // Yield operations are terminators and are never treated as pure ops.
+  if (isa<affine::AffineYieldOp>(op)) {
+    return true;
+  }
+
+  // Arith ops are pure; getDialect() may be null for unregistered ops.
+  if (llvm::isa_and_nonnull<arith::ArithDialect>(op->getDialect())) {
+    return false;
+  }
+
+  // affine.load or memref.load is considered side-effect-free (read-only).
+  if (isa<affine::AffineLoadOp, memref::LoadOp>(op)) {
+    return false;
+  }
+
+  // affine.store and memref.store are side-effecting (write operations).
+  if (isa<affine::AffineStoreOp, memref::StoreOp>(op)) {
+    return true;
+  }
+
+  // For other operations, conservatively assumes they have side effects.
+  return true;
+}
+
+// Collects loop bands from a function.
+static void collectLoopBands(func::FuncOp func_op,
+                             SmallVector<AffineLoopBand> &loop_bands) {
+  func_op.walk([&](affine::AffineForOp for_op) {
+    // Only processes outermost loops (skips nested loops).
+    if (for_op->getParentOfType<affine::AffineForOp>()) {
+      return;
+    }
+
+    AffineLoopBand current_band;
+    affine::AffineForOp current_loop = for_op;
+
+    // Follows the single-child nesting chain to build the loop band.
+    while (current_loop) {
+      current_band.push_back(current_loop);
+
+      // Checks if body has exactly one nested loop (perfect nesting).
+      Block &body = current_loop.getRegion().front();
+      affine::AffineForOp nested_loop = nullptr;
+      size_t num_loops = 0;
+      size_t num_other_ops = 0;
+
+      for (Operation &body_op : body) {
+        if (auto nested_for = dyn_cast<affine::AffineForOp>(&body_op)) {
+          nested_loop = nested_for;
+          num_loops++;
+        } else if (!isa<affine::AffineYieldOp>(&body_op)) {
+          // Counts other operations (excluding yield).
+          num_other_ops++;
+        }
+      }
+
+      // Loop bands condition: exactly 1 nested loop, any number of other ops
+      // (other ops will be perfectized).
+      if (num_loops == 1) {
+        current_loop = nested_loop;
+      } else {
+        // Has zero or multiple nested loops; the band ends here.
+        break;
+      }
+    }
+
+    if (!current_band.empty()) {
+      loop_bands.push_back(current_band);
+    }
+  });
+}
+
+//=================================================================
+// Loop Perfection Logic.
+//=================================================================
+
+// Applies loop perfection to a single loop band.
+// Sinks all operations into the innermost loop with condition execution.
+static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) {
+  if (loop_band.empty()) {
+    return failure();
+  }
+
+  llvm::errs() << "[LoopPerfection] Processing loop band with "
+               << loop_band.size() << " loops.\n";
+
+  affine::AffineForOp innermost_loop = loop_band.back();
+  OpBuilder builder(innermost_loop);
+
+  // Processes each (parent, child) loop pair from innermost to outermost.
+  for (size_t i = loop_band.size() - 1; i > 0; i--) {
+    affine::AffineForOp loop = loop_band[i - 1];
+    affine::AffineForOp child_loop = loop_band[i];
+
+    // Collects prologue and epilogue operations in the current loop (excluding
+    // the child loop).
+    SmallVector<Operation *> prologue_ops; // Before child loop.
+    SmallVector<Operation *> epilogue_ops; // After child loop.
+
+    bool is_prologue = true;
+    for (Operation &op : loop.getRegion().front()) {
+      if (&op == child_loop) {
+        is_prologue = false;
+        continue;
+      }
+
+      if (isa<affine::AffineYieldOp>(&op)) {
+        // Skips yield operations.
+        continue;
+      }
+
+      // Rejects operations that cannot be perfectized.
+      if (llvm::any_of(op.getResultTypes(),
+                       [](Type type) { return isa<MemRefType>(type); })) {
+        llvm::errs()
+            << "[LoopPerfection] Memref-producing op cannot be perfectized.\n";
+        op.dump();
+        return failure();
+      }
+
+      if (isa<func::CallOp>(&op)) {
+        llvm::errs()
+            << "[LoopPerfection] Function call op cannot be perfectized.\n";
+        op.dump();
+        return failure();
+      }
+
+      if (is_prologue) {
+        prologue_ops.push_back(&op);
+      } else {
+        epilogue_ops.push_back(&op);
+      }
+    }
+
+    if (prologue_ops.empty() && epilogue_ops.empty()) {
+      // No operations to perfect, continues to next loop.
+      continue;
+    }
+
+    Location loc = loop.getLoc();
+    Block &innermost_body = innermost_loop.getRegion().front();
+
+    // Gets all inner loops (from current child to innermost loop).
+    ArrayRef<affine::AffineForOp> inner_loops =
+        ArrayRef<affine::AffineForOp>(loop_band).drop_front(i);
+
+    // Handles prologue operations.
+    if (!prologue_ops.empty()) {
+      llvm::errs() << "  Moving " << prologue_ops.size()
+                   << " prologue operations\n";
+
+      Operation *insert_point = &innermost_body.front();
+
+      // Separates pure and side-effecting operations in the prologue.
+      SmallVector<Operation *> pure_ops;
+      SmallVector<Operation *> side_effect_ops;
+
+      for (Operation *op : prologue_ops) {
+        if (hasSideEffect(op)) {
+          side_effect_ops.push_back(op);
+        } else {
+          pure_ops.push_back(op);
+        }
+      }
+
+      // Moves pure operations directly into the innermost loop (will be CSE'd
+      // if redundant).
+      for (Operation *op : pure_ops) {
+        op->moveBefore(insert_point);
+      }
+
+      // Moves side-effecting operations into the innermost loop with condition
+      // execution.
+      if (!side_effect_ops.empty()) {
+        builder.setInsertionPoint(insert_point);
+        Value condition = createPrologueCondition(builder, loc, inner_loops);
+      }
+    }
+  }
+}
+
+//=================================================================
+// Pass Implementation.
+//=================================================================
+struct AffineLoopPerfectionPass
+    : public PassWrapper<AffineLoopPerfectionPass,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AffineLoopPerfectionPass)
+
+  StringRef getArgument() const final { return "affine-loop-perfection"; }
+  StringRef getDescription() const final {
+    return "Apply loop perfection for affine loops.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry
+        .insert<affine::AffineDialect, arith::ArithDialect,
+                memref::MemRefDialect, scf::SCFDialect, func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    func::FuncOp func_op = getOperation();
+    // Collects all loop bands in the function.
+    SmallVector<AffineLoopBand> loop_bands;
+    collectLoopBands(func_op, loop_bands);
+
+    if (loop_bands.empty()) {
+      llvm::errs() << "[LoopPerfection] No loop bands found in function: "
+                   << func_op.getName() << "\n";
+      return;
+    }
+
+    llvm::errs() << "[LoopPerfection] Found " << loop_bands.size()
+                 << " loop bands in function: " << func_op.getName() << "\n";
+
+    // Apply loop perfection to each loop band.
+    for (AffineLoopBand &band : loop_bands) {
+      if (failed(applyLoopPerfection(band))) {
+        signalPassFailure();
+        return;
+      }
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> mlir::taskflow::createAffineLoopPerfectionPass() {
+  return std::make_unique<AffineLoopPerfectionPass>();
+}
\ No newline at end of file
diff --git a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt
index 3e1ce5cd..2200f5b1 100644
--- a/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt
+++ b/lib/TaskflowDialect/Transforms/Optimizations/CMakeLists.txt
@@ -2,6 +2,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_mlir_conversion_library(MLIRTaskflowOptimization
   AffineLoopTreeSerializationPass.cpp
+  AffineLoopPerfectionPass.cpp
 
   DEPENDS
   MLIRTaskflowTransformsIncGen
diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench
index 2b5e78b2..2beecc59 160000
--- a/test/benchmark/CGRA-Bench
+++ b/test/benchmark/CGRA-Bench
@@ -1 +1 @@
-Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed
+Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0

From c45ce492c1a1a6fa147b0e071e4981110cba6020 Mon Sep 17 00:00:00 2001
From: ShangkunLI <shangkun.li@connect.ust.hk>
Date: Fri, 6 Feb 2026 12:46:49 +0800
Subject: [PATCH 2/5] enable affine loop perfection optimization

---
 .../AffineLoopPerfectionPass.cpp              | 157 ++++++++++++++++--
 1 file changed, 144 insertions(+), 13 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
index 15195684..34f82c36 100644
--- a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
+++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
@@ -1,5 +1,3 @@
-#include "TaskflowDialect/TaskflowDialect.h"
-#include "TaskflowDialect/TaskflowOps.h"
 #include "TaskflowDialect/TaskflowPasses.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -11,7 +9,6 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/TypeID.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
@@ -80,15 +77,11 @@ static void collectLoopBands(func::FuncOp func_op,
       Block &body = current_loop.getRegion().front();
       affine::AffineForOp nested_loop = nullptr;
       size_t num_loops = 0;
-      size_t num_other_ops = 0;
 
       for (Operation &body_op : body) {
         if (auto nested_for = dyn_cast<affine::AffineForOp>(&body_op)) {
           nested_loop = nested_for;
           num_loops++;
-        } else if (!isa<affine::AffineYieldOp>(&body_op)) {
-          // Counts other operations (excluding yield).
-          num_other_ops++;
         }
       }
 
@@ -112,6 +105,75 @@ static void collectLoopBands(func::FuncOp func_op,
 // Loop Perfection Logic.
 //=================================================================
 
+// Creates a condition checking if all inner loop indices are at their lower
+// bounds. Used for prologue condition.
+static Value
+createPrologueCondition(OpBuilder &builder, Location loc,
+                        ArrayRef<affine::AffineForOp> inner_loops) {
+  // Builds condition for prologue code: (i1 == lb1) && (i2 == lb2) && ...
+  Value condition = nullptr;
+
+  for (affine::AffineForOp loop : inner_loops) {
+    Value idx = loop.getInductionVar();
+    Value lb;
+
+    if (loop.hasConstantLowerBound()) {
+      lb = builder.create<arith::ConstantIndexOp>(loc,
+                                                  loop.getConstantLowerBound());
+    } else {
+      llvm::errs()
+          << "[LoopPerfection] Non-constant lower bound not supported.\n";
+      return nullptr;
+    }
+
+    Value eq =
+        builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, idx, lb);
+
+    if (condition) {
+      condition = builder.create<arith::AndIOp>(loc, condition, eq);
+    } else {
+      condition = eq;
+    }
+  }
+
+  return condition;
+}
+
+// Creates a condition checking if all inner loops are executing their last
+// iteration. Used for epilogue condition.
+static Value
+createEpilogueCondition(OpBuilder &builder, Location loc,
+                        ArrayRef<affine::AffineForOp> inner_loops) {
+  // Builds condition for epilogue code: (i1 is last) && (i2 is last) && ...,
+  // i.e., every inner loop is on its final iteration.
+  Value condition = nullptr;
+
+  for (affine::AffineForOp loop : inner_loops) {
+    Value idx = loop.getInductionVar();
+    Value ub_minus_1;
+
+    if (loop.hasConstantUpperBound()) {
+      ub_minus_1 = builder.create<arith::ConstantIndexOp>(
+          loc, loop.getConstantUpperBound() - 1);
+    } else {
+      llvm::errs()
+          << "[LoopPerfection] Non-constant upper bound not supported.\n";
+      return nullptr;
+    }
+
+    Value eq = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, idx,
+                                             ub_minus_1);
+
+    if (condition) {
+      condition = builder.create<arith::AndIOp>(loc, condition, eq);
+    } else {
+      condition = eq;
+    }
+  }
+
+  return condition;
+}
+
 // Applies loop perfection to a single loop band.
 // Sinks all operations into the innermost loop with condition execution.
 static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) {
@@ -130,8 +192,8 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) {
     affine::AffineForOp loop = loop_band[i - 1];
     affine::AffineForOp child_loop = loop_band[i];
 
-    // Collects prologue and epilogue operations in the current loop (excluding
-    // the child loop).
+    // Collects prologue and epilogue operations in the current loop
+    // (excluding the child loop).
     SmallVector<Operation *> prologue_ops; // Before child loop.
     SmallVector<Operation *> epilogue_ops; // After child loop.
 
@@ -150,8 +212,8 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) {
       // Rejects operations that cannot be perfectized.
       if (llvm::any_of(op.getResultTypes(),
                        [](Type type) { return isa<MemRefType>(type); })) {
-        llvm::errs()
-            << "[LoopPerfection] Memref-producing op cannot be perfectized.\n";
+        llvm::errs() << "[LoopPerfection] Memref-producing op cannot be "
+                        "perfectized.\n";
         op.dump();
         return failure();
       }
@@ -207,14 +269,83 @@ static LogicalResult applyLoopPerfection(AffineLoopBand &loop_band) {
         op->moveBefore(insert_point);
       }
 
-      // Moves side-effecting operations into the innermost loop with condition
-      // execution.
+      // Moves side-effecting operations into the innermost loop with
+      // condition execution.
       if (!side_effect_ops.empty()) {
         builder.setInsertionPoint(insert_point);
         Value condition = createPrologueCondition(builder, loc, inner_loops);
+
+        if (condition) {
+          scf::IfOp if_op = builder.create<scf::IfOp>(loc, condition,
+                                                      /*withElseRegion*/ false);
+
+          Block *then_block = if_op.thenBlock();
+
+          for (Operation *op : side_effect_ops) {
+            op->moveBefore(then_block->getTerminator());
+          }
+        } else {
+          // If condition creation fails, returns failure to avoid
+          // incorrect transformation.
+          llvm::errs()
+              << "[LoopPerfection] Failed to create prologue condition.\n";
+          return failure();
+        }
+      }
+    }
+
+    // Handles epilogue operations.
+    if (!epilogue_ops.empty()) {
+      llvm::errs() << "  Moving " << epilogue_ops.size()
+                   << " epilogue operations\n";
+
+      Operation *insert_point = innermost_body.getTerminator();
+
+      // Separates pure and side-effecting operations in the epilogue.
+      SmallVector<Operation *> pure_ops;
+      SmallVector<Operation *> side_effect_ops;
+
+      for (Operation *op : epilogue_ops) {
+        if (hasSideEffect(op)) {
+          side_effect_ops.push_back(op);
+        } else {
+          pure_ops.push_back(op);
+        }
+      }
+
+      // Moves pure operations directly into the innermost loop (will be CSE'd
+      // if redundant).
+      for (Operation *op : pure_ops) {
+        op->moveBefore(insert_point);
+      }
+
+      // Moves side-effecting operations into the innermost loop with
+      // condition execution.
+      if (!side_effect_ops.empty()) {
+        builder.setInsertionPoint(insert_point);
+        Value condition = createEpilogueCondition(builder, loc, inner_loops);
+
+        if (condition) {
+          scf::IfOp if_op = builder.create<scf::IfOp>(loc, condition,
+                                                      /*withElseRegion*/ false);
+
+          Block *then_block = if_op.thenBlock();
+
+          for (Operation *op : side_effect_ops) {
+            op->moveBefore(then_block->getTerminator());
+          }
+        } else {
+          // If condition creation fails, returns failure to avoid
+          // incorrect transformation.
+          llvm::errs()
+              << "[LoopPerfection] Failed to create epilogue condition.\n";
+          return failure();
+        }
       }
     }
   }
+
+  return success();
 }
 
 //=================================================================

From 61922156b45f858043460eb689d880ae2864b523 Mon Sep 17 00:00:00 2001
From: ShangkunLI <shangkun.li@connect.ust.hk>
Date: Fri, 6 Feb 2026 14:36:38 +0800
Subject: [PATCH 3/5] unstage submodule changes

---
 .../AffineLoopPerfectionPass.cpp              | 28 +++++++---
 .../irregular-loop/irregular-loop.mlir        | 44 +++++++++++++++
 .../taskflow/multi-nested/multi-nested.mlir   | 56 +++++++++++++++++++
 3 files changed, 121 insertions(+), 7 deletions(-)

diff --git a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
index 34f82c36..f22c1a66 100644
--- a/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
+++ b/lib/TaskflowDialect/Transforms/Optimizations/AffineLoopPerfectionPass.cpp
@@ -150,24 +150,38 @@ createEpilogueCondition(OpBuilder &builder, Location loc,
 
   for (affine::AffineForOp loop : inner_loops) {
     Value idx = loop.getInductionVar();
-    Value ub_minus_1;
+    Value next_idx; // idx + step
+    Value ub;
+
+    // Gets step.
+    int32_t step_val = 1;
+    if (loop.getStepAsInt()) {
+      step_val = loop.getStepAsInt();
+    } else {
+      llvm::errs() << "[LoopPerfection] Non-constant step not supported.\n";
+      return nullptr;
+    }
+
+    // Computes next_idx = idx + step.
+    Value step = builder.create<arith::ConstantIndexOp>(loc, step_val);
+    next_idx = builder.create<arith::AddIOp>(loc, idx, step);
 
     if (loop.hasConstantUpperBound()) {
-      ub_minus_1 = builder.create<arith::ConstantIndexOp>(
-          loc, loop.getConstantUpperBound() - 1);
+      ub = builder.create<arith::ConstantIndexOp>(loc,
+                                                  loop.getConstantUpperBound());
     } else {
       llvm::errs()
           << "[LoopPerfection] Non-constant upper bound not supported.\n";
       return nullptr;
     }
 
-    Value eq = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, idx,
-                                             ub_minus_1);
+    Value is_last = builder.create<arith::CmpIOp>(
+        loc, arith::CmpIPredicate::sge, next_idx, ub);
 
     if (condition) {
-      condition = builder.create<arith::AndIOp>(loc, condition, eq);
+      condition = builder.create<arith::AndIOp>(loc, condition, is_last);
     } else {
-      condition = eq;
+      condition = is_last;
     }
   }
 
diff --git a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
index 19fa277b..2a4eb496 100644
--- a/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
+++ b/test/multi-cgra/taskflow/irregular-loop/irregular-loop.mlir
@@ -2,6 +2,11 @@
 // RUN: -o %t.serialized.mlir
 // RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --affine-loop-perfection \
+// RUN: -o %t.perfect.mlir
+// RUN: FileCheck %s --input-file=%t.perfect.mlir --check-prefixes=PERFECT
+
 // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
 // RUN: --convert-affine-to-taskflow \
 // RUN: -o %t.taskflow.mlir
@@ -103,6 +108,45 @@ module attributes {} {
 // SERIALIZED-NEXT:   }
 // SERIALIZED-NEXT: }
 
+// PERFECT:      module {
+// PERFECT-NEXT:   func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+// PERFECT-NEXT:     %c2_i32 = arith.constant 2 : i32
+// PERFECT-NEXT:     %c8_i32 = arith.constant 8 : i32
+// PERFECT-NEXT:     %c0_i32 = arith.constant 0 : i32
+// PERFECT-NEXT:     %alloca = memref.alloca() : memref<i32>
+// PERFECT-NEXT:     %alloca_0 = memref.alloca() : memref<4x8xi32>
+// PERFECT-NEXT:     %0 = affine.for %arg0 = 0 to 5 iter_args(%arg1 = %c0_i32) -> (i32) {
+// PERFECT-NEXT:       %2 = arith.index_cast %arg0 : index to i32
+// PERFECT-NEXT:       %3 = arith.addi %arg1, %2 : i32
+// PERFECT-NEXT:       affine.yield %3 : i32
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg0 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg1 = 0 to 8 {
+// PERFECT-NEXT:         %2 = arith.index_cast %arg0 : index to i32
+// PERFECT-NEXT:         %3 = arith.muli %2, %c8_i32 : i32
+// PERFECT-NEXT:         %4 = arith.index_cast %arg1 : index to i32
+// PERFECT-NEXT:         %5 = arith.addi %3, %4 : i32
+// PERFECT-NEXT:         affine.store %5, %alloca_0[%arg0, %arg1] : memref<4x8xi32>
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg0 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg1 = 0 to 8 {
+// PERFECT-NEXT:         %2 = arith.index_cast %arg0 : index to i32
+// PERFECT-NEXT:         %3 = arith.muli %2, %c8_i32 : i32
+// PERFECT-NEXT:         %4 = affine.load %alloca_0[%arg0, %arg1] : memref<4x8xi32>
+// PERFECT-NEXT:         %5 = arith.addi %4, %0 : i32
+// PERFECT-NEXT:         affine.if #set(%arg0, %arg1) {
+// PERFECT-NEXT:           affine.store %5, %alloca[] : memref<i32>
+// PERFECT-NEXT:           %6 = arith.muli %5, %c2_i32 : i32
+// PERFECT-NEXT:           affine.store %6, %alloca[] : memref<i32>
+// PERFECT-NEXT:         }
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     %1 = affine.load %alloca[] : memref<i32>
+// PERFECT-NEXT:     return %1 : i32
+// PERFECT-NEXT:   }
+// PERFECT-NEXT: }
+
 // TASKFLOW:      #set = affine_set<(d0, d1) : (d0 - 3 == 0, d1 - 7 == 0)>
 // TASKFLOW-NEXT: module {
 // TASKFLOW-NEXT:   func.func @_Z21irregularLoopExample1v() -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
diff --git a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
index ebdbe079..e6376f44 100644
--- a/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
+++ b/test/multi-cgra/taskflow/multi-nested/multi-nested.mlir
@@ -2,6 +2,11 @@
 // RUN: -o %t.serialized.mlir
 // RUN: FileCheck %s --input-file=%t.serialized.mlir --check-prefixes=SERIALIZED
 
+// RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
+// RUN: --affine-loop-perfection \
+// RUN: -o %t.perfect.mlir
+// RUN: FileCheck %s --input-file=%t.perfect.mlir --check-prefixes=PERFECT
+
 // RUN: mlir-neura-opt %s --affine-loop-tree-serialization \
 // RUN: --convert-affine-to-taskflow \
 // RUN: -o %t.taskflow.mlir
@@ -118,6 +123,57 @@ module attributes {} {
 // SERIALIZED-NEXT:   }
 // SERIALIZED-NEXT: }
 
+// PERFECT:      module {
+// PERFECT-NEXT:   func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
+// PERFECT-NEXT:     affine.for %arg10 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg11 = 0 to 8 {
+// PERFECT-NEXT:         affine.for %arg12 = 0 to 6 {
+// PERFECT-NEXT:           %1 = affine.load %arg0[%arg10, %arg11, %arg12] : memref<?x8x6xi32>
+// PERFECT-NEXT:           affine.store %1, %arg5[%arg12] : memref<?xi32>
+// PERFECT-NEXT:         }
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg10 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg11 = 0 to 8 {
+// PERFECT-NEXT:         affine.for %arg12 = 0 to 5 {
+// PERFECT-NEXT:           %1 = affine.load %arg1[%arg10, %arg11, %arg12] : memref<?x8x5xi32>
+// PERFECT-NEXT:           %2 = affine.load %arg2[%arg10, %arg11, %arg12] : memref<?x8x5xi32>
+// PERFECT-NEXT:           %3 = arith.addi %1, %2 : i32
+// PERFECT-NEXT:           affine.store %3, %arg6[%arg12] : memref<?xi32>
+// PERFECT-NEXT:         }
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg10 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg11 = 0 to 8 {
+// PERFECT-NEXT:         affine.for %arg12 = 0 to 6 {
+// PERFECT-NEXT:           %1 = affine.load %arg5[%arg12] : memref<?xi32>
+// PERFECT-NEXT:           %2 = affine.load %arg6[%arg12] : memref<?xi32>
+// PERFECT-NEXT:           %3 = arith.addi %1, %2 : i32
+// PERFECT-NEXT:           %4 = affine.load %arg9[0] : memref<?xi32>
+// PERFECT-NEXT:           %5 = arith.addi %4, %3 : i32
+// PERFECT-NEXT:           affine.store %5, %arg9[0] : memref<?xi32>
+// PERFECT-NEXT:         }
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg10 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg11 = 0 to 7 {
+// PERFECT-NEXT:         %1 = affine.load %arg3[%arg10, %arg11] : memref<?x7xi32>
+// PERFECT-NEXT:         affine.store %1, %arg7[%arg11] : memref<?xi32>
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     affine.for %arg10 = 0 to 4 {
+// PERFECT-NEXT:       affine.for %arg11 = 0 to 9 {
+// PERFECT-NEXT:         %1 = affine.load %arg4[%arg10, %arg11] : memref<?x9xi32>
+// PERFECT-NEXT:         %2 = affine.load %arg7[%arg11] : memref<?xi32>
+// PERFECT-NEXT:         %3 = arith.addi %1, %2 : i32
+// PERFECT-NEXT:         affine.store %3, %arg8[%arg11] : memref<?xi32>
+// PERFECT-NEXT:       }
+// PERFECT-NEXT:     }
+// PERFECT-NEXT:     %0 = affine.load %arg9[0] : memref<?xi32>
+// PERFECT-NEXT:     return %0 : i32
+// PERFECT-NEXT:   }
+// PERFECT-NEXT: }
+
 // TASKFLOW:      module {
 // TASKFLOW-NEXT:   func.func @_Z21pureNestedLoopExamplePA8_A6_iPA8_A5_iS4_PA7_iPA9_iPiS9_S9_S9_S9_(%arg0: memref<?x8x6xi32>, %arg1: memref<?x8x5xi32>, %arg2: memref<?x8x5xi32>, %arg3: memref<?x7xi32>, %arg4: memref<?x9xi32>, %arg5: memref<?xi32>, %arg6: memref<?xi32>, %arg7: memref<?xi32>, %arg8: memref<?xi32>, %arg9: memref<?xi32>) -> i32 attributes {llvm.linkage = #llvm.linkage<external>} {
 // TASKFLOW-NEXT:     %write_outputs = taskflow.task @Task_0 read_memrefs(%arg0 : memref<?x8x6xi32>) write_memrefs(%arg5 : memref<?xi32>) [original_read_memrefs(%arg0 : memref<?x8x6xi32>), original_write_memrefs(%arg5 : memref<?xi32>)] : (memref<?x8x6xi32>, memref<?xi32>) -> (memref<?xi32>) {

From f9ff3d112415c79c7be74a99aa71a6fdb7ff9aff Mon Sep 17 00:00:00 2001
From: ShangkunLI <shangkun.li@connect.ust.hk>
Date: Fri, 6 Feb 2026 15:17:11 +0800
Subject: [PATCH 4/5] sync cgrabench

---
 test/benchmark/CGRA-Bench        | 2 +-
 test/benchmark/Zeonica_Testbench | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench
index 2beecc59..f130a221 160000
--- a/test/benchmark/CGRA-Bench
+++ b/test/benchmark/CGRA-Bench
@@ -1 +1 @@
-Subproject commit 2beecc599bd268f8665344ba2271f48c97db7aa0
+Subproject commit f130a22161890f9f52e16651b2dd28008e4ff03f
diff --git a/test/benchmark/Zeonica_Testbench b/test/benchmark/Zeonica_Testbench
index 45e85e44..c7590d83 160000
--- a/test/benchmark/Zeonica_Testbench
+++ b/test/benchmark/Zeonica_Testbench
@@ -1 +1 @@
-Subproject commit 45e85e44d58670e08a88dfcebfd471909699ae2c
+Subproject commit c7590d836df404dca078c4c5104c39673100a4af

From fac94aedf28c879c668509c6b2949d29002084e3 Mon Sep 17 00:00:00 2001
From: ShangkunLI <shangkun.li@connect.ust.hk>
Date: Fri, 6 Feb 2026 15:22:17 +0800
Subject: [PATCH 5/5] update cgrabench

---
 test/benchmark/CGRA-Bench | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/benchmark/CGRA-Bench b/test/benchmark/CGRA-Bench
index f130a221..2b5e78b2 160000
--- a/test/benchmark/CGRA-Bench
+++ b/test/benchmark/CGRA-Bench
@@ -1 +1 @@
-Subproject commit f130a22161890f9f52e16651b2dd28008e4ff03f
+Subproject commit 2b5e78b24d481c8465c82672a8d5177a86119aed