Skip to content

Commit 15da4aa

Browse files
authored
[Spark] Only enable a single legacy feature with legacy metadata properties (#3657)
#### Which Delta project/connector is this regarding? - [X] Spark - [ ] Standalone - [ ] Flink - [ ] Kernel - [ ] Other (fill in here) ## Description Currently, enabling legacy features on legacy protocols with metadata properties results in enabling all preceding legacy features. For example, enabling enableChangeDataFeed results in protocol (1, 4). This is inconsistent with the rest of the protocol operations. In this PR, we fix this inconsistency by always enabling only the requested feature. This is a behavioral change. ## How was this patch tested? Existing and new unit tests. ## Does this PR introduce _any_ user-facing changes? Yes. When enabling a feature using a table property, e.g. by setting `delta.enableChangeDataFeed` to `true`, you would previously typically get protocol `(1, 4)`. Now you would get `(1, 7, changeDataFeed)`. The user can get `(1, 4)` by also asking for `delta.minWriterVersion = 4`. This change is OK now because (a) enabling fewer features is safer than enabling more features, (b) Deletion Vectors requires table features support, and it is very popular to implement, so many clients have added support for table features, and (c) users can easily get back to the legacy protocol by ALTERing the protocol and asking for `delta.minWriterVersion = 4`. Signed-off-by: Bart Samwel <bart.samwel@databricks.com>
1 parent ac667ff commit 15da4aa

12 files changed

Lines changed: 200 additions & 153 deletions

File tree

sharing/src/test/scala/io/delta/sharing/spark/DeltaSharingTestSparkUtils.scala

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ trait DeltaSharingTestSparkUtils extends DeltaSQLTestUtils {
8585

8686
protected def createSimpleTable(tableName: String, enableCdf: Boolean): Unit = {
8787
val tablePropertiesStr = if (enableCdf) {
88-
"TBLPROPERTIES (delta.enableChangeDataFeed = true)"
88+
"""TBLPROPERTIES (
89+
|delta.minReaderVersion=1,
90+
|delta.minWriterVersion=4,
91+
|delta.enableChangeDataFeed = true)""".stripMargin
8992
} else {
9093
""
9194
}

spark/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala

Lines changed: 18 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -322,12 +322,12 @@ object Protocol {
322322
*
323323
* This function returns the protocol versions and features individually instead of a
324324
* [[Protocol]], so the caller can identify the features that caused the protocol version. For
325-
* example, if the return values are (2, 5, columnMapping), the caller can safely ignore all
326-
* other features required by the protocol with a reader and writer version of 2 and 5.
325+
* example, if the return values are (2, 5, columnMapping + preceding features), the caller
326+
* can safely ignore all other features required by the protocol with a reader and writer
327+
* version of 2 and 5.
327328
*
328-
* Note that this method does not consider protocol versions and features configured in session
329-
* defaults. To make them effective, copy them to `metadata` using
330-
* [[DeltaConfigs.mergeGlobalConfigs]].
329+
* Note that this method does not consider features configured in session defaults.
330+
* To make them effective, copy them to `metadata` using [[DeltaConfigs.mergeGlobalConfigs]].
331331
*/
332332
def minProtocolComponentsFromMetadata(
333333
spark: SparkSession,
@@ -343,46 +343,11 @@ object Protocol {
343343
spark, metadata, Protocol().withFeatures(tablePropEnabledFeatures))
344344
val allEnabledFeatures = tablePropEnabledFeatures ++ metaEnabledFeatures
345345

346-
// Determine the min reader and writer version required by features in table properties or
347-
// metadata.
348-
// If any table property is specified:
349-
// we start from (3, 7) or (0, 7) depending on the existence of any writer-only feature.
350-
// If there's no table property:
351-
// if no feature is enabled or all features are legacy, we start from (0, 0);
352-
// if any feature is native and is reader-writer, we start from (3, 7);
353-
// otherwise we start from (0, 7) because there must exist a native writer-only feature.
354-
var (readerVersionFromFeatures, writerVersionFromFeatures) = {
355-
if (tablePropEnabledFeatures.exists(_.isReaderWriterFeature)) {
356-
(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION)
357-
} else if (tablePropEnabledFeatures.nonEmpty) {
358-
(0, TABLE_FEATURES_MIN_WRITER_VERSION)
359-
} else if (metaEnabledFeatures.forall(_.isLegacyFeature)) { // also true for empty set
360-
(0, 0)
361-
} else if (metaEnabledFeatures.exists(f => !f.isLegacyFeature && f.isReaderWriterFeature)) {
362-
(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION)
363-
} else {
364-
(0, TABLE_FEATURES_MIN_WRITER_VERSION)
365-
}
366-
}
367-
allEnabledFeatures.foreach { feature =>
368-
readerVersionFromFeatures = math.max(readerVersionFromFeatures, feature.minReaderVersion)
369-
writerVersionFromFeatures = math.max(writerVersionFromFeatures, feature.minWriterVersion)
370-
}
371-
372346
// Protocol versions provided in table properties can upgrade the protocol, but only when they
373347
// are higher than those required by the enabled features.
374348
val (readerVersionFromTableConfOpt, writerVersionFromTableConfOpt) =
375349
getProtocolVersionsFromTableConf(tableConf)
376350

377-
// Decide the final protocol version:
378-
// a. 1, aka the lowest version possible
379-
// b. version required by manually enabled features and metadata features
380-
// c. version defined as table properties
381-
val finalReaderVersion =
382-
Seq(1, readerVersionFromFeatures, readerVersionFromTableConfOpt.getOrElse(0)).max
383-
val finalWriterVersion =
384-
Seq(1, writerVersionFromFeatures, writerVersionFromTableConfOpt.getOrElse(0)).max
385-
386351
// If the user explicitly sets the table versions, we need to take into account the
387352
// relevant implicit features.
388353
val implicitFeaturesFromTableConf =
@@ -399,7 +364,14 @@ object Protocol {
399364
case _ => Set.empty
400365
}
401366

402-
(finalReaderVersion, finalWriterVersion, allEnabledFeatures ++ implicitFeaturesFromTableConf)
367+
// Construct the minimum required protocol for the enabled features.
368+
val minProtocol = Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION)
369+
.withFeatures(allEnabledFeatures ++ implicitFeaturesFromTableConf)
370+
.normalized
371+
372+
// Return the minimum protocol components.
373+
(minProtocol.minReaderVersion, minProtocol.minWriterVersion,
374+
minProtocol.implicitlyAndExplicitlySupportedFeatures)
403375
}
404376

405377
/**
@@ -488,32 +460,12 @@ object Protocol {
488460
spark: SparkSession,
489461
metadata: Metadata,
490462
current: Protocol): Option[Protocol] = {
491-
val (readerVersion, writerVersion, minRequiredFeatures) =
492-
minProtocolComponentsFromAutomaticallyEnabledFeatures(spark, metadata, current)
493-
494-
// If the user sets the protocol versions we need to take it account. In general,
495-
// enabling legacy features on legacy protocols results to pumping up the protocol
496-
// versions. However, setting table feature protocol versions while enabling
497-
// legacy features results to only enabling the requested features. For example:
498-
// 1) Create table with (1, 2), then ALTER TABLE with DeltaConfigs.CHANGE_DATA_FEED.key = true
499-
// results to (1, 4).
500-
// 2) Alternatively, Create table with (1, 2), then
501-
// ALTER TABLE set versions (1, 7) and DeltaConfigs.CHANGE_DATA_FEED.key = true results
502-
// to (1, 7, AppendOnly, Invariants, CDF).
503-
val readerVersionFromConf =
504-
Protocol.getReaderVersionFromTableConf(metadata.configuration).getOrElse(readerVersion)
505-
val writerVersionFromConf =
506-
Protocol.getWriterVersionFromTableConf(metadata.configuration).getOrElse(writerVersion)
507-
508-
val finalReaderVersion =
509-
Seq(readerVersion, readerVersionFromConf, current.minReaderVersion).max
510-
val finalWriterVersion =
511-
Seq(writerVersion, writerVersionFromConf, current.minWriterVersion).max
512-
513-
// Increment the reader and writer version to accurately add enabled legacy table features
514-
// either to the implicitly enabled table features or the table feature lists.
463+
515464
val required =
516-
Protocol(finalReaderVersion, finalWriterVersion).withFeatures(minRequiredFeatures)
465+
Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION)
466+
.withFeatures(extractAutomaticallyEnabledFeatures(spark, metadata, current))
467+
.normalized
468+
517469
if (!required.canUpgradeTo(current)) {
518470
// When the current protocol does not satisfy metadata requirement, some additional features
519471
// must be supported by the protocol. We assert those features can actually perform the

spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCSQLSuite.scala

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,9 +308,16 @@ class DeltaCDCSQLSuite extends DeltaCDCSuiteBase with DeltaColumnMappingTestUtil
308308
// We set CDC to be enabled by default, so this should automatically bump the writer protocol
309309
// to the required version.
310310
if (columnMappingEnabled) {
311-
assert(log.snapshot.protocol == Protocol(2, 5))
311+
assert(log.update().protocol == Protocol(2, 7).withFeatures(Seq(
312+
AppendOnlyTableFeature,
313+
InvariantsTableFeature,
314+
ChangeDataFeedTableFeature,
315+
ColumnMappingTableFeature)))
312316
} else {
313-
assert(log.snapshot.protocol == Protocol(1, 4))
317+
assert(log.update().protocol == Protocol(1, 7).withFeatures(Seq(
318+
AppendOnlyTableFeature,
319+
InvariantsTableFeature,
320+
ChangeDataFeedTableFeature)))
314321
}
315322
}
316323
}

spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingSuite.scala

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -482,12 +482,13 @@ class DeltaColumnMappingSuite extends QueryTest
482482
expectedSchema: StructType,
483483
ignorePhysicalName: Boolean,
484484
mode: String,
485-
createNewTable: Boolean = true)(fn: => Unit): Unit = {
485+
createNewTable: Boolean = true,
486+
tableFeaturesProtocolExpected: Boolean = true)(fn: => Unit): Unit = {
486487
withTable(tableName) {
487488
fn
488489
checkProperties(tableName,
489490
readerVersion = 2,
490-
writerVersion = 5,
491+
writerVersion = if (tableFeaturesProtocolExpected) 7 else 5,
491492
mode = Some(mode),
492493
curMaxId = DeltaColumnMapping.findMaxColumnId(expectedSchema)
493494
)
@@ -826,7 +827,7 @@ class DeltaColumnMappingSuite extends QueryTest
826827
checkSchema("t1", schemaWithId)
827828
checkProperties("t1",
828829
readerVersion = 2,
829-
writerVersion = 5,
830+
writerVersion = 7,
830831
mode = Some(mode),
831832
curMaxId = DeltaColumnMapping.findMaxColumnId(schemaWithId)
832833
)
@@ -849,7 +850,7 @@ class DeltaColumnMappingSuite extends QueryTest
849850

850851
checkProperties("t1",
851852
readerVersion = 2,
852-
writerVersion = 5,
853+
writerVersion = 7,
853854
mode = Some(mode),
854855
curMaxId = DeltaColumnMapping.findMaxColumnId(schemaWithIdNested))
855856
checkSchema(
@@ -871,7 +872,7 @@ class DeltaColumnMappingSuite extends QueryTest
871872

872873
checkProperties("t1",
873874
readerVersion = 2,
874-
writerVersion = 5,
875+
writerVersion = 7,
875876
mode = Some(mode),
876877
curMaxId = curMaxId)
877878

@@ -886,7 +887,7 @@ class DeltaColumnMappingSuite extends QueryTest
886887
)
887888
checkProperties("t1",
888889
readerVersion = 2,
889-
writerVersion = 5,
890+
writerVersion = 7,
890891
mode = Some(mode),
891892
curMaxId = curMaxId2)
892893
checkSchema("t1",
@@ -938,7 +939,7 @@ class DeltaColumnMappingSuite extends QueryTest
938939

939940
checkProperties("t1",
940941
readerVersion = 2,
941-
writerVersion = 5,
942+
writerVersion = 7,
942943
mode = Some(mode),
943944
curMaxId = curMaxId)
944945
checkSchema("t1",
@@ -960,7 +961,7 @@ class DeltaColumnMappingSuite extends QueryTest
960961
)
961962
checkProperties("t1",
962963
readerVersion = 2,
963-
writerVersion = 5,
964+
writerVersion = 7,
964965
mode = Some(mode),
965966
curMaxId = curMaxId2)
966967
checkSchema("t1",
@@ -998,7 +999,7 @@ class DeltaColumnMappingSuite extends QueryTest
998999

9991000
checkProperties("t1",
10001001
readerVersion = 2,
1001-
writerVersion = 5,
1002+
writerVersion = 7,
10021003
mode = Some(mode),
10031004
curMaxId = curMaxId)
10041005
checkSchema("t1", schemaWithId)
@@ -1013,7 +1014,7 @@ class DeltaColumnMappingSuite extends QueryTest
10131014

10141015
checkProperties("t1",
10151016
readerVersion = 2,
1016-
writerVersion = 5,
1017+
writerVersion = 7,
10171018
mode = Some(mode),
10181019
curMaxId = curMaxId)
10191020

@@ -1037,7 +1038,7 @@ class DeltaColumnMappingSuite extends QueryTest
10371038
val curMaxId2 = DeltaColumnMapping.findMaxColumnId(schemaWithId) + 1
10381039
checkProperties("t1",
10391040
readerVersion = 2,
1040-
writerVersion = 5,
1041+
writerVersion = 7,
10411042
mode = Some(mode),
10421043
curMaxId = curMaxId2)
10431044
checkSchema("t1", schemaWithId.add("c", StringType, true, withId(3)))
@@ -1627,7 +1628,8 @@ class DeltaColumnMappingSuite extends QueryTest
16271628
schemaWithDottedColumnNames,
16281629
false,
16291630
"name",
1630-
createNewTable = false
1631+
createNewTable = false,
1632+
tableFeaturesProtocolExpected = false
16311633
) {
16321634
sql(s"CREATE TABLE t1 (${schemaWithDottedColumnNames.toDDL}) USING DELTA")
16331635
alterTableWithProps("t1", props = Map(

spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLUsingPathSuite.scala

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -169,25 +169,25 @@ trait DeltaDDLUsingPathTests extends QueryTest
169169
"key" -> "value")
170170
}
171171

172+
val protocol = Protocol.forNewTable(spark, Some(metadata))
173+
val supportedFeatures = protocol
174+
.readerAndWriterFeatureNames
175+
.map(name => s"delta.feature.$name" -> "supported")
176+
val expectedProperties = Seq(
177+
"delta.logRetentionDuration" -> "2 weeks",
178+
"delta.minReaderVersion" -> protocol.minReaderVersion.toString,
179+
"delta.minWriterVersion" -> protocol.minWriterVersion.toString,
180+
"key" -> "value") ++ supportedFeatures
181+
172182
checkDatasetUnorderly(
173183
dropColumnMappingConfigurations(
174184
sql(s"SHOW TBLPROPERTIES $table").as[(String, String)]),
175-
"delta.logRetentionDuration" -> "2 weeks",
176-
"delta.minReaderVersion" ->
177-
Protocol.forNewTable(spark, Some(metadata)).minReaderVersion.toString,
178-
"delta.minWriterVersion" ->
179-
Protocol.forNewTable(spark, Some(metadata)).minWriterVersion.toString,
180-
"key" -> "value")
185+
expectedProperties: _*)
181186

182187
checkDatasetUnorderly(
183188
dropColumnMappingConfigurations(
184189
sql(s"SHOW TBLPROPERTIES delta.`$path`").as[(String, String)]),
185-
"delta.logRetentionDuration" -> "2 weeks",
186-
"delta.minReaderVersion" ->
187-
Protocol.forNewTable(spark, Some(metadata)).minReaderVersion.toString,
188-
"delta.minWriterVersion" ->
189-
Protocol.forNewTable(spark, Some(metadata)).minWriterVersion.toString,
190-
"key" -> "value")
190+
expectedProperties: _*)
191191

192192
if (table == "`delta_test`") {
193193
val tableName = s"$catalogName.default.delta_test"

0 commit comments

Comments
 (0)