Avoid calling Dataset.isEmpty in vacuum
## Description

`diff.isEmpty` is not necessarily cheap: `Dataset.isEmpty` launches a Spark job just to check for rows. This PR removes the check and replaces `Dataset.reduce` with `collect().sum`, which returns 0 for an empty result and therefore needs no guard.

## Does this PR introduce _any_ user-facing changes?

Closes #1306

Signed-off-by: Shixiong Zhu <zsxwing@gmail.com>
GitOrigin-RevId: 6a43ee8643da724423e42b37c4f503b1a8024295
zsxwing authored and tdas committed Aug 11, 2022
1 parent 40943c6 commit 7344149
Showing 1 changed file with 1 addition and 4 deletions.
```diff
@@ -318,15 +318,12 @@ trait VacuumCommandImpl extends DeltaCommand {
     import spark.implicits._

     if (parallel) {
-      // If there are no entries, do not call reduce as it results in empty collection error
-      if (diff.isEmpty) return 0
-
       diff.repartition(parallelPartitions).mapPartitions { files =>
         val fs = new Path(basePath).getFileSystem(hadoopConf.value.value)
         val filesDeletedPerPartition =
           files.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f))
         Iterator(filesDeletedPerPartition)
-      }.reduce(_ + _)
+      }.collect().sum
     } else {
       val fs = new Path(basePath).getFileSystem(hadoopConf.value.value)
       val fileResultSet = diff.toLocalIterator().asScala
```
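The removed comment explains why the guard existed: `reduce` on an empty collection throws an "empty collection" error, whereas `sum` folds from a zero element and returns 0 for empty input. A minimal plain-Scala sketch of the same distinction (standard collections standing in for the driver-side result of a Spark `Dataset`):

```scala
// reduce has no identity element, so it throws on empty input --
// this is why the old code needed the isEmpty guard before calling it.
val empty = Seq.empty[Long]
// empty.reduce(_ + _)   // would throw UnsupportedOperationException

// sum starts from 0, so an empty input is fine and no guard is needed;
// this mirrors `collect().sum` in the patched code.
val deletedWhenEmpty: Long = empty.sum   // 0

// On non-empty input the two agree:
val perPartitionCounts = Seq(3L, 5L, 2L)
assert(perPartitionCounts.sum == perPartitionCounts.reduce(_ + _))
```

The trade-off in the patch itself: `collect()` pulls the per-partition counts to the driver, but there is only one small `Long` per partition, so this is cheap compared with the extra Spark job that `isEmpty` triggered.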
