
Commit 0e45ad2

[Spark] Remove dropped columns when running REORG PURGE (#3371)
#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

As proposed in #3228, this PR adds support for finding and removing dropped columns when running `REORG TABLE ... APPLY (PURGE)`: files that still physically contain columns absent from the current table schema are now selected for rewrite, in addition to files with soft-deleted rows. Closes #3228.

## How was this patch tested?

Through unit tests in `DeltaReorgSuite.scala`.

## Does this PR introduce _any_ user-facing changes?

No.
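For context, the user-facing flow this change targets looks roughly like the following sketch (table path and column names are hypothetical; `spark` is assumed to be an active `SparkSession` with the Delta SQL extensions enabled):

```scala
// Hypothetical walkthrough of the scenario this PR addresses. After DROP COLUMN,
// the existing parquet files still physically contain the dropped column; with this
// change, REORG ... APPLY (PURGE) also rewrites those files so the column is removed
// at the parquet level.
spark.sql("CREATE TABLE delta.`/tmp/reorg_purge_demo` (id BIGINT, to_drop BIGINT) USING delta " +
  "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')")
spark.sql("INSERT INTO delta.`/tmp/reorg_purge_demo` SELECT id, id % 4 FROM range(100)")
spark.sql("ALTER TABLE delta.`/tmp/reorg_purge_demo` DROP COLUMN to_drop")  // logical drop only
spark.sql("REORG TABLE delta.`/tmp/reorg_purge_demo` APPLY (PURGE)")        // now also purges to_drop
```

Before this change, PURGE only rewrote files with deletion vectors or soft-deleted rows; files whose only issue was a physically retained dropped column were left untouched.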
1 parent 3797fe8 commit 0e45ad2

3 files changed (+215, -9 lines)

spark/src/main/scala/org/apache/spark/sql/delta/commands/DeltaReorgTableCommand.scala

Lines changed: 15 additions & 4 deletions
@@ -102,14 +102,25 @@ sealed trait DeltaReorgOperation {
 
 /**
  * Reorg operation to purge files with soft deleted rows.
+ * This operation also tries to find and remove dropped columns from parquet files,
+ * i.e., columns that no longer exist in the current table schema.
  */
-class DeltaPurgeOperation extends DeltaReorgOperation {
+class DeltaPurgeOperation extends DeltaReorgOperation with ReorgTableHelper {
   override def filterFilesToReorg(spark: SparkSession, snapshot: Snapshot, files: Seq[AddFile])
-    : Seq[AddFile] =
-    files.filter { file =>
-      (file.deletionVector != null && file.numPhysicalRecords.isEmpty) ||
+    : Seq[AddFile] = {
+    val physicalSchema = DeltaColumnMapping.renameColumns(snapshot.schema)
+    val protocol = snapshot.protocol
+    val metadata = snapshot.metadata
+    val filesWithDroppedColumns: Seq[AddFile] =
+      filterParquetFilesOnExecutors(spark, files, snapshot, ignoreCorruptFiles = false) {
+        schema => fileHasExtraColumns(schema, physicalSchema, protocol, metadata)
+      }
+    val filesWithDV: Seq[AddFile] = files.filter { file =>
+      (file.deletionVector != null && file.numPhysicalRecords.isEmpty) ||
        file.numDeletedRecords > 0L
     }
+    (filesWithDroppedColumns ++ filesWithDV).distinct
+  }
 }
 
 /**
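The new selection logic above is a union of two candidate sets followed by deduplication. A standalone sketch of that shape, using a simplified stand-in type rather than the real `AddFile` (illustrative only):

```scala
// Illustrative only: a simplified stand-in for the AddFile selection in
// DeltaPurgeOperation.filterFilesToReorg. `hasDroppedColumns` plays the role of
// the parquet-footer check, `hasSoftDeletes` the deletion-vector check.
case class FileInfo(path: String, hasDroppedColumns: Boolean, hasSoftDeletes: Boolean)

def filesToRewrite(files: Seq[FileInfo]): Seq[FileInfo] = {
  val withDroppedColumns = files.filter(_.hasDroppedColumns)
  val withSoftDeletes = files.filter(_.hasSoftDeletes)
  // A file matching both criteria should only be rewritten once.
  (withDroppedColumns ++ withSoftDeletes).distinct
}

// Example: f1 and f3 are selected; f3 appears once despite matching both criteria.
val selected = filesToRewrite(Seq(
  FileInfo("f1", hasDroppedColumns = true, hasSoftDeletes = false),
  FileInfo("f2", hasDroppedColumns = false, hasSoftDeletes = false),
  FileInfo("f3", hasDroppedColumns = true, hasSoftDeletes = true)))
```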

spark/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableHelper.scala

Lines changed: 50 additions & 4 deletions
@@ -16,10 +16,10 @@
 
 package org.apache.spark.sql.delta.commands
 
-import org.apache.spark.sql.delta.Snapshot
-import org.apache.spark.sql.delta.actions.AddFile
+import org.apache.spark.sql.delta.{MaterializedRowCommitVersion, MaterializedRowId, Snapshot}
+import org.apache.spark.sql.delta.actions.{AddFile, Metadata, Protocol}
 import org.apache.spark.sql.delta.commands.VacuumCommand.generateCandidateFileMap
-import org.apache.spark.sql.delta.schema.SchemaMergingUtils
+import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils}
 import org.apache.spark.sql.delta.util.DeltaFileOperations
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
@@ -31,8 +31,12 @@ import org.apache.spark.util.SerializableConfiguration
 
 trait ReorgTableHelper extends Serializable {
   /**
-   * Determine whether `fileSchema` has any columns that has a type that differs from
+   * Determine whether `fileSchema` has any column whose type differs from its counterpart in
    * `tablePhysicalSchema`.
+   *
+   * @param fileSchema the parquet file schema to check.
+   * @param tablePhysicalSchema the current physical table schema.
+   * @return whether the file has any column with a type different from the table column.
    */
   protected def fileHasDifferentTypes(
       fileSchema: StructType,
@@ -46,6 +50,48 @@ trait ReorgTableHelper extends Serializable {
     false
   }
 
+  /**
+   * Determine whether `fileSchema` has any column that does not exist in
+   * `tablePhysicalSchema`; this can happen after ALTER TABLE commands,
+   * e.g., ALTER TABLE ... DROP COLUMN.
+   *
+   * @param fileSchema the parquet file schema to check.
+   * @param tablePhysicalSchema the current physical table schema.
+   * @param protocol the protocol used to resolve the materialized `row_id` and
+   *                 `row_commit_version` column names.
+   * @param metadata the metadata used to resolve the materialized `row_id` and
+   *                 `row_commit_version` column names.
+   * @return whether the file has any dropped column.
+   */
+  protected def fileHasExtraColumns(
+      fileSchema: StructType,
+      tablePhysicalSchema: StructType,
+      protocol: Protocol,
+      metadata: Metadata): Boolean = {
+    // 0. Get the materialized names for `row_id` and `row_commit_version`.
+    val materializedRowIdColumnNameOpt =
+      MaterializedRowId.getMaterializedColumnName(protocol, metadata)
+    val materializedRowCommitVersionColumnNameOpt =
+      MaterializedRowCommitVersion.getMaterializedColumnName(protocol, metadata)
+
+    SchemaMergingUtils.transformColumns(fileSchema) { (path, field, _) =>
+      // 1. Check whether the field exists in `tablePhysicalSchema`.
+      val fullName = path :+ field.name
+      val inTableFieldOpt = SchemaUtils.findNestedFieldIgnoreCase(
+        tablePhysicalSchema, fullName, includeCollections = true)
+
+      // 2. Check whether the current `field` is the `row_id` or `row_commit_version`
+      //    column; if so, keep it explicitly since these columns are not part of the
+      //    table schema but do exist in the parquet file.
+      val isRowIdOrRowCommitVersion = materializedRowIdColumnNameOpt.contains(field.name) ||
+        materializedRowCommitVersionColumnNameOpt.contains(field.name)
+
+      if (inTableFieldOpt.isEmpty && !isRowIdOrRowCommitVersion) {
+        return true
+      }
+      field
+    }
+    false
+  }
+
   /**
    * Apply a filter on the list of AddFile to only keep the files that have physical parquet schema
    * that satisfies the given filter function.
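To make the extra-column check concrete outside Delta internals, here is a minimal sketch over plain Spark `StructType`s. It only compares top-level field names and ignores the nested-column traversal and the materialized `row_id`/`row_commit_version` special cases handled by `fileHasExtraColumns` above (the helper name and schemas are illustrative):

```scala
import org.apache.spark.sql.types._

// Illustrative only: a flat-schema version of the "extra column" idea.
// The real helper walks nested columns and also keeps the materialized
// row_id / row_commit_version columns; this sketch checks top-level names only.
def hasExtraTopLevelColumns(fileSchema: StructType, tableSchema: StructType): Boolean = {
  val tableFieldNames = tableSchema.fieldNames.map(_.toLowerCase).toSet
  fileSchema.fieldNames.exists(name => !tableFieldNames.contains(name.toLowerCase))
}

val fileSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("id_dropped", LongType)))   // still present in the parquet file
val tableSchema = StructType(Seq(
  StructField("id", LongType)))           // dropped from the table schema

assert(hasExtraTopLevelColumns(fileSchema, tableSchema))   // file needs a rewrite
```

The real implementation additionally resolves names against the physical (column-mapping) schema and traverses nested fields via `SchemaMergingUtils.transformColumns`.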

spark/src/test/scala/org/apache/spark/sql/delta/optimize/DeltaReorgSuite.scala

Lines changed: 150 additions & 1 deletion
@@ -16,14 +16,20 @@
 
 package org.apache.spark.sql.delta.optimize
 
-import org.apache.spark.sql.delta.DeletionVectorsTestUtils
+import org.apache.spark.sql.delta.{DeletionVectorsTestUtils, DeltaColumnMapping, DeltaLog}
+import org.apache.spark.sql.delta.actions.AddFile
+import org.apache.spark.sql.delta.commands.VacuumCommand.generateCandidateFileMap
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.delta.util.DeltaFileOperations
 import io.delta.tables.DeltaTable
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.parquet.hadoop.Footer
 
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.SerializableConfiguration
 
 class DeltaReorgSuite extends QueryTest
   with SharedSparkSession
@@ -150,4 +156,147 @@ class DeltaReorgSuite extends QueryTest
     // Because each deleted file has a DV associated with it which gets rewritten as part of PURGE
     assert(opMetrics("numDeletionVectorsRemoved").toLong === numFilesRemoved)
   }
+
+  /**
+   * Get all parquet footers for the input `files`, used only for testing.
+   *
+   * @param files the sequence of `AddFile` whose data file paths are used to
+   *              read the parquet footers.
+   * @param log the delta log used to get the configuration and data path.
+   * @return the parquet footers corresponding to the input `AddFile`s.
+   */
+  private def getParquetFooters(
+      files: Seq[AddFile],
+      log: DeltaLog): Seq[Footer] = {
+    val serializedConf = new SerializableConfiguration(log.newDeltaHadoopConf())
+    val dataPath = new Path(log.dataPath.toString)
+    val nameToAddFileMap = generateCandidateFileMap(dataPath, files)
+    val fileStatuses = nameToAddFileMap.map { case (absPath, addFile) =>
+      new FileStatus(
+        /* length */ addFile.size,
+        /* isDir */ false,
+        /* blockReplication */ 0,
+        /* blockSize */ 1,
+        /* modificationTime */ addFile.modificationTime,
+        new Path(absPath)
+      )
+    }
+    DeltaFileOperations.readParquetFootersInParallel(
+      serializedConf.value,
+      fileStatuses.toList,
+      ignoreCorruptFiles = false
+    )
+  }
+
+  test("Purge dropped columns of a table without DV") {
+    val targetDf = spark.range(0, 100, 1, numPartitions = 5)
+      .withColumn("id_dropped", col("id") % 4)
+      .toDF()
+    withTempDeltaTable(targetDf) { (_, log) =>
+      val path = log.dataPath.toString
+
+      val (addFiles1, _) = getFileActionsInLastVersion(log)
+      assert(addFiles1.size === 5)
+      val footers1 = getParquetFooters(addFiles1, log)
+      footers1.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 2)
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+      }
+
+      // Enable column mapping first.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | SET TBLPROPERTIES (
+           |   'delta.columnMapping.mode' = 'name'
+           | )
+           |""".stripMargin
+      )
+      // Drop the extra column via ALTER TABLE and run REORG PURGE.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | DROP COLUMN id_dropped
+           |""".stripMargin
+      )
+      executePurge(path)
+
+      val (addFiles2, _) = getFileActionsInLastVersion(log)
+      assert(addFiles2.size === 1)
+      val footers2 = getParquetFooters(addFiles2, log)
+      footers2.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 1)
+        assert(!fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+      }
+    }
+  }
+
+  test("Columns being renamed should not be purged") {
+    val targetDf = spark.range(0, 100, 1, numPartitions = 5)
+      .withColumn("id_before_rename", col("id") % 4)
+      .withColumn("id_dropped", col("id") % 5)
+      .toDF()
+    withTempDeltaTable(targetDf) { (_, log) =>
+      val path = log.dataPath.toString
+
+      val (addFiles1, _) = getFileActionsInLastVersion(log)
+      assert(addFiles1.size === 5)
+      val footers1 = getParquetFooters(addFiles1, log)
+      footers1.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 3)
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_before_rename"))
+      }
+
+      // Enable column mapping first.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | SET TBLPROPERTIES (
+           |   'delta.columnMapping.mode' = 'name'
+           | )
+           |""".stripMargin
+      )
+      // Drop `id_dropped` and rename `id_before_rename` via ALTER TABLE, then run
+      // REORG PURGE; this should remove `id_dropped` from the parquet files but keep
+      // the data of the renamed column.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | DROP COLUMN id_dropped
+           |""".stripMargin
+      )
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | RENAME COLUMN id_before_rename TO id_after_rename
+           |""".stripMargin
+      )
+      executePurge(path)
+
+      val tableSchema = log.update().schema
+      val tablePhysicalSchema = DeltaColumnMapping.renameColumns(tableSchema)
+      val beforeRenameColStr = "StructField(id_before_rename,LongType,true)"
+      val afterRenameColStr = "StructField(id_after_rename,LongType,true)"
+      assert(tableSchema.fields.length == 2 &&
+        tableSchema.map { _.toString }.contains(afterRenameColStr))
+      assert(tablePhysicalSchema.fields.length == 2 &&
+        tablePhysicalSchema.map { _.toString }.contains(beforeRenameColStr))
+
+      val (addFiles2, _) = getFileActionsInLastVersion(log)
+      assert(addFiles2.size === 1)
+      val footers2 = getParquetFooters(addFiles2, log)
+      footers2.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 2)
+        assert(!fields.toArray.map { _.toString }.contains("optional int64 id_dropped = 3"))
+        // Note that the physical column name is not changed at the parquet file level.
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_before_rename = 2"))
+      }
+    }
+  }
 }
