
Commit 8db9617

johanl-db authored and vkorukanti committed
Fix inconsistent field metadata in MERGE
Which Delta project/connector is this regarding? Spark.

Fixes an issue where internal metadata attached to fields leaks into the plan constructed when executing a MERGE command, causing the same attribute to appear both with and without the internal metadata. This can make plan validation fail because the same attribute then has two apparently different data types (metadata is part of a field's data type).

Added a test that would fail without the fix.

Closes #2612

GitOrigin-RevId: 50688200c53f9450512b76ee2d375b2e55db8216
1 parent cc1660b commit 8db9617
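The failure mode can be reproduced with Spark's public types API alone. The following minimal sketch (illustration only, not code from this commit) shows that metadata on a nested field is part of the enclosing StructType, so two otherwise identical struct types do not compare equal:

import org.apache.spark.sql.types._

// Nested-field metadata is baked into the enclosing StructType, i.e. into the
// data type of the struct column itself.
val withMeta = StructType(Seq(
  StructField("foo", IntegerType, nullable = true,
    new MetadataBuilder().putString("comment", "foo field").build())))
val withoutMeta = StructType(Seq(StructField("foo", IntegerType)))

// Both render as struct<foo:int>, yet they are not equal; this is why a plan
// holding the same attribute with and without metadata fails validation.
println(withMeta.simpleString == withoutMeta.simpleString) // true
println(withMeta == withoutMeta)                           // false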

File tree

4 files changed, +46 −6 lines changed


spark/src/main/scala/org/apache/spark/sql/delta/commands/MergeIntoCommandBase.scala

Lines changed: 11 additions & 3 deletions
@@ -291,7 +291,7 @@ trait MergeIntoCommandBase extends LeafRunnableCommand
       fileIndex: TahoeFileIndex,
       columnsToDrop: Seq[String]): LogicalPlan = {
 
-    val targetOutputCols = getTargetOutputCols(deltaTxn)
+    val targetOutputCols = getTargetOutputCols(spark, deltaTxn)
 
     val plan = {
 
@@ -339,8 +339,16 @@ trait MergeIntoCommandBase extends LeafRunnableCommand
    * this transaction, since new columns will have a value of null for all existing rows.
    */
   protected def getTargetOutputCols(
-      txn: OptimisticTransaction, makeNullable: Boolean = false): Seq[NamedExpression] = {
-    txn.metadata.schema.map { col =>
+      spark: SparkSession,
+      txn: OptimisticTransaction,
+      makeNullable: Boolean = false)
+    : Seq[NamedExpression] = {
+    // Internal metadata attached to the table schema must not leak into the target plan to
+    // prevent inconsistencies, e.g. metadata matters when comparing the data types of structs
+    // with nested fields.
+    val schema = DeltaColumnMapping.dropColumnMappingMetadata(
+      DeltaTableUtils.removeInternalMetadata(spark, txn.metadata.schema))
+    schema.map { col =>
       targetOutputAttributesMap
         .get(col.name)
         .map { a =>
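For intuition, here is a hypothetical helper (an assumption for illustration, not the actual Delta implementation) showing what dropping per-field metadata keys amounts to; the real DeltaTableUtils.removeInternalMetadata and DeltaColumnMapping.dropColumnMappingMetadata used above are the authoritative versions and also handle nested fields:

import org.apache.spark.sql.types._

// Hypothetical sketch: rebuild each top-level field without the given metadata
// keys (for column mapping these would be keys such as 'delta.columnMapping.id'
// and 'delta.columnMapping.physicalName'). Nested types are not recursed here.
def dropMetadataKeys(schema: StructType, keys: Seq[String]): StructType =
  StructType(schema.map { field =>
    val cleaned = keys
      .foldLeft(new MetadataBuilder().withMetadata(field.metadata))(_ remove _)
      .build()
    field.copy(metadata = cleaned)
  })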

spark/src/main/scala/org/apache/spark/sql/delta/commands/merge/ClassicMergeExecutor.scala

Lines changed: 1 addition & 1 deletion
@@ -388,7 +388,7 @@ trait ClassicMergeExecutor extends MergeOutputGeneration {
 
     // The target output columns need to be marked as nullable here, as they are going to be used
     // to reference the output of an outer join.
-    val targetOutputCols = getTargetOutputCols(deltaTxn, makeNullable = true)
+    val targetOutputCols = getTargetOutputCols(spark, deltaTxn, makeNullable = true)
 
     // If there are N columns in the target table, the full outer join output will have:
     // - N columns for target table
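A quick generic illustration of why makeNullable = true is needed (a standalone sketch with made-up data, not commit code): after a full outer join, columns from either side can be NULL even when their source schema declared them non-nullable, so the references built from the target schema must be nullable too.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val source = Seq((1, 1)).toDF("key", "value") // Int columns: nullable = false
val target = Seq((0, 0)).toDF("key", "value")

// Unmatched rows yield NULLs on the other side, so Spark reports the columns
// of a full outer join as nullable regardless of the input schemas.
val joined = source.join(target, source("key") === target("key"), "full_outer")
joined.printSchema()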

spark/src/main/scala/org/apache/spark/sql/delta/commands/merge/InsertOnlyMergeExecutor.scala

Lines changed: 3 additions & 2 deletions
@@ -96,7 +96,7 @@ trait InsertOnlyMergeExecutor extends MergeOutputGeneration {
       sourceDF
     }
 
-    val outputDF = generateInsertsOnlyOutputDF(preparedSourceDF, deltaTxn)
+    val outputDF = generateInsertsOnlyOutputDF(spark, preparedSourceDF, deltaTxn)
     logDebug(s"$extraOpType: output plan:\n" + outputDF.queryExecution)
 
     val newFiles = writeFiles(spark, deltaTxn, outputDF)
@@ -142,10 +142,11 @@ trait InsertOnlyMergeExecutor extends MergeOutputGeneration {
    * and when there are multiple insert clauses.
    */
   private def generateInsertsOnlyOutputDF(
+      spark: SparkSession,
       preparedSourceDF: DataFrame,
       deltaTxn: OptimisticTransaction): DataFrame = {
 
-    val targetOutputColNames = getTargetOutputCols(deltaTxn).map(_.name)
+    val targetOutputColNames = getTargetOutputCols(spark, deltaTxn).map(_.name)
 
     // When there is only one insert clause, there is no need for ROW_DROPPED_COL and
     // output df can be generated without CaseWhen.

spark/src/test/scala/org/apache/spark/sql/delta/MergeIntoSuiteBase.scala

Lines changed: 31 additions & 0 deletions
@@ -3154,6 +3154,37 @@ abstract class MergeIntoSuiteBase
       expectedErrorMsgForDataSetTempView = null
     )
 
+  test("merge correctly handles field metadata") {
+    withTable("source", "target") {
+      // Create a target table with user metadata (comments) and internal metadata (column
+      // mapping information) on both a top-level column and a nested field.
+      sql(
+        """
+          |CREATE TABLE target(
+          |  key int not null COMMENT 'data column',
+          |  value int not null,
+          |  cstruct struct<foo int COMMENT 'foo field'>)
+          |USING DELTA
+          |TBLPROPERTIES (
+          |  'delta.minReaderVersion' = '2',
+          |  'delta.minWriterVersion' = '5',
+          |  'delta.columnMapping.mode' = 'name')
+        """.stripMargin
+      )
+      sql(s"INSERT INTO target VALUES (0, 0, null)")
+
+      sql("CREATE TABLE source (key int not null, value int not null) USING DELTA")
+      sql(s"INSERT INTO source VALUES (1, 1)")
+
+      executeMerge(
+        tgt = "target",
+        src = "source",
+        cond = "source.key = target.key",
+        update(condition = "target.key = 1", set = "target.value = 42"),
+        updateNotMatched(condition = "target.key = 100", set = "target.value = 22"))
+    }
+  }
+
   test("UDT Data Types - simple and nested") {
     withTable("source") {
       withTable("target") {
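For reference, the executeMerge call in the new test corresponds roughly to the following SQL (an assumption about how the suite's helper renders its arguments; the exact text it builds may differ):

sql(
  """
    |MERGE INTO target
    |USING source
    |ON source.key = target.key
    |WHEN MATCHED AND target.key = 1 THEN UPDATE SET target.value = 42
    |WHEN NOT MATCHED BY SOURCE AND target.key = 100 THEN UPDATE SET target.value = 22
  """.stripMargin)

Per the commit description, this statement would fail plan validation on the column-mapped target without the fix, because the target attributes would appear both with and without the internal field metadata.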
