
Commit 0e45ad2

[Spark] Remove dropped columns when running REORG PURGE (#3371)
#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

As proposed in #3228, this PR adds support for finding and removing dropped columns when running `REORG TABLE ... APPLY (PURGE)`: files that still physically contain columns absent from the current table schema are now selected for rewrite, in addition to files with soft-deleted rows. Closes #3228.

## How was this patch tested?

Through unit tests in `DeltaReorgSuite.scala`.

## Does this PR introduce _any_ user-facing changes?

No.
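For context, the user-facing flow this change targets looks roughly like the following sketch (table path and column names are hypothetical; `spark` is assumed to be an active `SparkSession` with the Delta SQL extensions enabled):

```scala
// Hypothetical walkthrough of the scenario this PR addresses. After DROP COLUMN,
// the existing parquet files still physically contain the dropped column; with this
// change, REORG ... APPLY (PURGE) also rewrites those files so the column is removed
// at the parquet level.
spark.sql("CREATE TABLE delta.`/tmp/reorg_purge_demo` (id BIGINT, to_drop BIGINT) USING delta " +
  "TBLPROPERTIES ('delta.columnMapping.mode' = 'name')")
spark.sql("INSERT INTO delta.`/tmp/reorg_purge_demo` SELECT id, id % 4 FROM range(100)")
spark.sql("ALTER TABLE delta.`/tmp/reorg_purge_demo` DROP COLUMN to_drop")  // logical drop only
spark.sql("REORG TABLE delta.`/tmp/reorg_purge_demo` APPLY (PURGE)")        // now also purges to_drop
```

Before this change, PURGE only rewrote files with deletion vectors or soft-deleted rows; files whose only issue was a physically retained dropped column were left untouched.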
1 parent 3797fe8 commit 0e45ad2

3 files changed (+215, -9 lines)

spark/src/main/scala/org/apache/spark/sql/delta/commands/DeltaReorgTableCommand.scala

Lines changed: 15 additions & 4 deletions
@@ -102,14 +102,25 @@ sealed trait DeltaReorgOperation {
 
 /**
  * Reorg operation to purge files with soft deleted rows.
+ * This operation also tries to find and remove dropped columns from parquet files,
+ * i.e., columns that no longer exist in the current table schema.
  */
-class DeltaPurgeOperation extends DeltaReorgOperation {
+class DeltaPurgeOperation extends DeltaReorgOperation with ReorgTableHelper {
   override def filterFilesToReorg(spark: SparkSession, snapshot: Snapshot, files: Seq[AddFile])
-    : Seq[AddFile] =
-    files.filter { file =>
-      (file.deletionVector != null && file.numPhysicalRecords.isEmpty) ||
+    : Seq[AddFile] = {
+    val physicalSchema = DeltaColumnMapping.renameColumns(snapshot.schema)
+    val protocol = snapshot.protocol
+    val metadata = snapshot.metadata
+    val filesWithDroppedColumns: Seq[AddFile] =
+      filterParquetFilesOnExecutors(spark, files, snapshot, ignoreCorruptFiles = false) {
+        schema => fileHasExtraColumns(schema, physicalSchema, protocol, metadata)
+      }
+    val filesWithDV: Seq[AddFile] = files.filter { file =>
+      (file.deletionVector != null && file.numPhysicalRecords.isEmpty) ||
        file.numDeletedRecords > 0L
     }
+    (filesWithDroppedColumns ++ filesWithDV).distinct
+  }
 }
 
 /**
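The new selection logic above is a union of two candidate sets followed by deduplication. A standalone sketch of that shape, using a simplified stand-in type rather than the real `AddFile` (illustrative only):

```scala
// Illustrative only: a simplified stand-in for the AddFile selection in
// DeltaPurgeOperation.filterFilesToReorg. `hasDroppedColumns` plays the role of
// the parquet-footer check, `hasSoftDeletes` the deletion-vector check.
case class FileInfo(path: String, hasDroppedColumns: Boolean, hasSoftDeletes: Boolean)

def filesToRewrite(files: Seq[FileInfo]): Seq[FileInfo] = {
  val withDroppedColumns = files.filter(_.hasDroppedColumns)
  val withSoftDeletes = files.filter(_.hasSoftDeletes)
  // A file matching both criteria should only be rewritten once.
  (withDroppedColumns ++ withSoftDeletes).distinct
}

// Example: f1 and f3 are selected; f3 appears once despite matching both criteria.
val selected = filesToRewrite(Seq(
  FileInfo("f1", hasDroppedColumns = true, hasSoftDeletes = false),
  FileInfo("f2", hasDroppedColumns = false, hasSoftDeletes = false),
  FileInfo("f3", hasDroppedColumns = true, hasSoftDeletes = true)))
```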

spark/src/main/scala/org/apache/spark/sql/delta/commands/ReorgTableHelper.scala

Lines changed: 50 additions & 4 deletions
@@ -16,10 +16,10 @@
 
 package org.apache.spark.sql.delta.commands
 
-import org.apache.spark.sql.delta.Snapshot
-import org.apache.spark.sql.delta.actions.AddFile
+import org.apache.spark.sql.delta.{MaterializedRowCommitVersion, MaterializedRowId, Snapshot}
+import org.apache.spark.sql.delta.actions.{AddFile, Metadata, Protocol}
 import org.apache.spark.sql.delta.commands.VacuumCommand.generateCandidateFileMap
-import org.apache.spark.sql.delta.schema.SchemaMergingUtils
+import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils}
 import org.apache.spark.sql.delta.util.DeltaFileOperations
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
@@ -31,8 +31,12 @@ import org.apache.spark.util.SerializableConfiguration
 
 trait ReorgTableHelper extends Serializable {
   /**
-   * Determine whether `fileSchema` has any columns that has a type that differs from
+   * Determine whether `fileSchema` has any column whose type differs from its counterpart in
    * `tablePhysicalSchema`.
+   *
+   * @param fileSchema the parquet file schema to check.
+   * @param tablePhysicalSchema the current physical table schema.
+   * @return whether the file has any column with a type different from the table column.
    */
   protected def fileHasDifferentTypes(
       fileSchema: StructType,
@@ -46,6 +50,48 @@ trait ReorgTableHelper extends Serializable {
     false
   }
 
+  /**
+   * Determine whether `fileSchema` has any column that does not exist in
+   * `tablePhysicalSchema`; this can happen after ALTER TABLE commands,
+   * e.g., ALTER TABLE ... DROP COLUMN.
+   *
+   * @param fileSchema the parquet file schema to check.
+   * @param tablePhysicalSchema the current physical table schema.
+   * @param protocol the protocol used to resolve the materialized `row_id` and
+   *                 `row_commit_version` column names.
+   * @param metadata the metadata used to resolve the materialized `row_id` and
+   *                 `row_commit_version` column names.
+   * @return whether the file has any dropped column.
+   */
+  protected def fileHasExtraColumns(
+      fileSchema: StructType,
+      tablePhysicalSchema: StructType,
+      protocol: Protocol,
+      metadata: Metadata): Boolean = {
+    // 0. Get the materialized names for `row_id` and `row_commit_version`.
+    val materializedRowIdColumnNameOpt =
+      MaterializedRowId.getMaterializedColumnName(protocol, metadata)
+    val materializedRowCommitVersionColumnNameOpt =
+      MaterializedRowCommitVersion.getMaterializedColumnName(protocol, metadata)
+
+    SchemaMergingUtils.transformColumns(fileSchema) { (path, field, _) =>
+      // 1. Check whether the field exists in `tablePhysicalSchema`.
+      val fullName = path :+ field.name
+      val inTableFieldOpt = SchemaUtils.findNestedFieldIgnoreCase(
+        tablePhysicalSchema, fullName, includeCollections = true)
+
+      // 2. Check whether the current `field` is the `row_id` or `row_commit_version`
+      //    column; if so, keep it explicitly since these columns are not part of the
+      //    table schema but do exist in the parquet file.
+      val isRowIdOrRowCommitVersion = materializedRowIdColumnNameOpt.contains(field.name) ||
+        materializedRowCommitVersionColumnNameOpt.contains(field.name)
+
+      if (inTableFieldOpt.isEmpty && !isRowIdOrRowCommitVersion) {
+        return true
+      }
+      field
+    }
+    false
+  }
+
   /**
    * Apply a filter on the list of AddFile to only keep the files that have physical parquet schema
    * that satisfies the given filter function.
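To make the extra-column check concrete outside Delta internals, here is a minimal sketch over plain Spark `StructType`s. It only compares top-level field names and ignores the nested-column traversal and the materialized `row_id`/`row_commit_version` special cases handled by `fileHasExtraColumns` above (the helper name and schemas are illustrative):

```scala
import org.apache.spark.sql.types._

// Illustrative only: a flat-schema version of the "extra column" idea.
// The real helper walks nested columns and also keeps the materialized
// row_id / row_commit_version columns; this sketch checks top-level names only.
def hasExtraTopLevelColumns(fileSchema: StructType, tableSchema: StructType): Boolean = {
  val tableFieldNames = tableSchema.fieldNames.map(_.toLowerCase).toSet
  fileSchema.fieldNames.exists(name => !tableFieldNames.contains(name.toLowerCase))
}

val fileSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("id_dropped", LongType)))   // still present in the parquet file
val tableSchema = StructType(Seq(
  StructField("id", LongType)))           // dropped from the table schema

assert(hasExtraTopLevelColumns(fileSchema, tableSchema))   // file needs a rewrite
```

The real implementation additionally resolves names against the physical (column-mapping) schema and traverses nested fields via `SchemaMergingUtils.transformColumns`.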

spark/src/test/scala/org/apache/spark/sql/delta/optimize/DeltaReorgSuite.scala

Lines changed: 150 additions & 1 deletion
@@ -16,14 +16,20 @@
 
 package org.apache.spark.sql.delta.optimize
 
-import org.apache.spark.sql.delta.DeletionVectorsTestUtils
+import org.apache.spark.sql.delta.{DeletionVectorsTestUtils, DeltaColumnMapping, DeltaLog}
+import org.apache.spark.sql.delta.actions.AddFile
+import org.apache.spark.sql.delta.commands.VacuumCommand.generateCandidateFileMap
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.delta.util.DeltaFileOperations
 import io.delta.tables.DeltaTable
+import org.apache.hadoop.fs.{FileStatus, Path}
+import org.apache.parquet.hadoop.Footer
 
 import org.apache.spark.sql.QueryTest
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.test.SharedSparkSession
+import org.apache.spark.util.SerializableConfiguration
 
 class DeltaReorgSuite extends QueryTest
   with SharedSparkSession
@@ -150,4 +156,147 @@ class DeltaReorgSuite extends QueryTest
     // Because each deleted file has a DV associated with it which gets rewritten as part of PURGE
     assert(opMetrics("numDeletionVectorsRemoved").toLong === numFilesRemoved)
   }
+
+  /**
+   * Get all parquet footers for the input `files`, used only for testing.
+   *
+   * @param files the sequence of `AddFile` whose data file paths are used to
+   *              read the parquet footers.
+   * @param log the delta log used to get the configuration and data path.
+   * @return the parquet footers corresponding to the input `AddFile`s.
+   */
+  private def getParquetFooters(
+      files: Seq[AddFile],
+      log: DeltaLog): Seq[Footer] = {
+    val serializedConf = new SerializableConfiguration(log.newDeltaHadoopConf())
+    val dataPath = new Path(log.dataPath.toString)
+    val nameToAddFileMap = generateCandidateFileMap(dataPath, files)
+    val fileStatuses = nameToAddFileMap.map { case (absPath, addFile) =>
+      new FileStatus(
+        /* length */ addFile.size,
+        /* isDir */ false,
+        /* blockReplication */ 0,
+        /* blockSize */ 1,
+        /* modificationTime */ addFile.modificationTime,
+        new Path(absPath)
+      )
+    }
+    DeltaFileOperations.readParquetFootersInParallel(
+      serializedConf.value,
+      fileStatuses.toList,
+      ignoreCorruptFiles = false
+    )
+  }
+
+  test("Purge dropped columns of a table without DV") {
+    val targetDf = spark.range(0, 100, 1, numPartitions = 5)
+      .withColumn("id_dropped", col("id") % 4)
+      .toDF()
+    withTempDeltaTable(targetDf) { (_, log) =>
+      val path = log.dataPath.toString
+
+      val (addFiles1, _) = getFileActionsInLastVersion(log)
+      assert(addFiles1.size === 5)
+      val footers1 = getParquetFooters(addFiles1, log)
+      footers1.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 2)
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+      }
+
+      // Enable column mapping first.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | SET TBLPROPERTIES (
+           |   'delta.columnMapping.mode' = 'name'
+           | )
+           |""".stripMargin
+      )
+      // Drop the extra column via ALTER TABLE and run REORG PURGE.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | DROP COLUMN id_dropped
+           |""".stripMargin
+      )
+      executePurge(path)
+
+      val (addFiles2, _) = getFileActionsInLastVersion(log)
+      assert(addFiles2.size === 1)
+      val footers2 = getParquetFooters(addFiles2, log)
+      footers2.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 1)
+        assert(!fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+      }
+    }
+  }
+
+  test("Columns being renamed should not be purged") {
+    val targetDf = spark.range(0, 100, 1, numPartitions = 5)
+      .withColumn("id_before_rename", col("id") % 4)
+      .withColumn("id_dropped", col("id") % 5)
+      .toDF()
+    withTempDeltaTable(targetDf) { (_, log) =>
+      val path = log.dataPath.toString
+
+      val (addFiles1, _) = getFileActionsInLastVersion(log)
+      assert(addFiles1.size === 5)
+      val footers1 = getParquetFooters(addFiles1, log)
+      footers1.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 3)
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_dropped"))
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_before_rename"))
+      }
+
+      // Enable column mapping first.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | SET TBLPROPERTIES (
+           |   'delta.columnMapping.mode' = 'name'
+           | )
+           |""".stripMargin
+      )
+      // Drop `id_dropped` and rename `id_before_rename` via ALTER TABLE, then run
+      // REORG PURGE; this should remove `id_dropped` from the parquet files but keep
+      // the data of the renamed column.
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | DROP COLUMN id_dropped
+           |""".stripMargin
+      )
+      sql(
+        s"""
+           | ALTER TABLE delta.`$path`
+           | RENAME COLUMN id_before_rename TO id_after_rename
+           |""".stripMargin
+      )
+      executePurge(path)
+
+      val tableSchema = log.update().schema
+      val tablePhysicalSchema = DeltaColumnMapping.renameColumns(tableSchema)
+      val beforeRenameColStr = "StructField(id_before_rename,LongType,true)"
+      val afterRenameColStr = "StructField(id_after_rename,LongType,true)"
+      assert(tableSchema.fields.length == 2 &&
+        tableSchema.map { _.toString }.contains(afterRenameColStr))
+      assert(tablePhysicalSchema.fields.length == 2 &&
+        tablePhysicalSchema.map { _.toString }.contains(beforeRenameColStr))
+
+      val (addFiles2, _) = getFileActionsInLastVersion(log)
+      assert(addFiles2.size === 1)
+      val footers2 = getParquetFooters(addFiles2, log)
+      footers2.foreach { footer =>
+        val fields = footer.getParquetMetadata.getFileMetaData.getSchema.getFields
+        assert(fields.size == 2)
+        assert(!fields.toArray.map { _.toString }.contains("optional int64 id_dropped = 3"))
+        // Note that the physical column name is not changed at the parquet file level.
+        assert(fields.toArray.map { _.toString }.contains("optional int64 id_before_rename = 2"))
+      }
+    }
+  }
 }
