
Commit 3441df1

jackierwzhang authored and vkorukanti committed
Support non-additive Delta source schema evolution with schema tracking log.
Closes #1690

GitOrigin-RevId: 5b06490b5bb16ea1f92f5e68d67674537ab7cb24
1 parent c87ea1d · commit 3441df1

17 files changed · +1892 −331 lines changed

core/src/main/resources/error/delta-error-classes.json

+32-8
@@ -80,14 +80,6 @@
     ],
     "sqlState" : "42KD4"
   },
-  "DELTA_BLOCK_COLUMN_MAPPING_SCHEMA_INCOMPATIBLE_OPERATION" : {
-    "message" : [
-      "<opName> is not supported on tables with read-incompatible schema changes (e.g. rename or drop or datatype changes).",
-      "Read schema: <readSchema>. Incompatible data schema: <incompatibleSchema>.",
-      "Although strongly not recommended, you may also force ignore the schema checks during <opName> at your own risk of potentially incorrect results by turning on the SQL conf <escapeConfig>."
-    ],
-    "sqlState" : "42KD4"
-  },
   "DELTA_BLOOM_FILTER_DROP_ON_NON_EXISTING_COLUMNS" : {
     "message" : [
       "Cannot drop bloom filter indices for the following non-existent column(s): <unknownColumns>"
@@ -1571,6 +1563,30 @@
     ],
     "sqlState" : "KD002"
   },
+  "DELTA_STREAMING_INCOMPATIBLE_SCHEMA_CHANGE" : {
+    "message" : [
+      "Streaming read is not supported on tables with read-incompatible schema changes (e.g. rename or drop or datatype changes).",
+      "Read schema: <readSchema>. Incompatible data schema: <incompatibleSchema>."
+    ],
+    "sqlState" : "42KD4"
+  },
+  "DELTA_STREAMING_INCOMPATIBLE_SCHEMA_CHANGE_USE_SCHEMA_LOG" : {
+    "message" : [
+      "Streaming read is not supported on tables with read-incompatible schema changes (e.g. rename or drop or datatype changes).",
+      "Read schema: <readSchema>. Incompatible data schema: <incompatibleSchema>.",
+      "Please provide a 'schemaTrackingLocation' to enable non-additive schema evolution for Delta stream processing.",
+      "See <docLink> for more details."
+    ],
+    "sqlState" : "42KD4"
+  },
+  "DELTA_STREAMING_SCHEMA_EVOLUTION" : {
+    "message" : [
+      "The schema of your Delta table has changed during streaming, and the schema tracking log has been updated",
+      "Please restart the stream to continue processing using the updated schema:",
+      "<schema>"
+    ],
+    "sqlState" : "22000"
+  },
   "DELTA_STREAMING_SCHEMA_LOCATION_CONFLICT" : {
     "message" : [
       "Detected conflicting schema location '<loc>' while streaming from table or table located at '<table>'.",
@@ -1608,6 +1624,14 @@
     ],
     "sqlState" : "22000"
   },
+  "DELTA_STREAMING_SCHEMA_LOG_INIT_FAILED_INCOMPATIBLE_SCHEMA" : {
+    "message" : [
+      "We could not initialize the Delta streaming source schema log with a valid schema because",
+      "we detected an incompatible schema change while serving a streaming batch from table version <a> to <b>.",
+      "To continue processing the stream with latest schema, please turn on <config>."
+    ],
+    "sqlState" : "22000"
+  },
   "DELTA_STREAMING_SCHEMA_LOG_PARSE_SCHEMA_FAILED" : {
     "message" : [
       "Failed to parse the schema from the Delta streaming source schema log.",

core/src/main/scala/org/apache/spark/sql/delta/DeltaAnalysis.scala

+1-1
@@ -856,7 +856,7 @@ class DeltaAnalysis(session: SparkSession)
     assert(options.get("path").isDefined, "Path for Delta table must be defined")
     val log = DeltaLog.forTable(session, options.get("path").get)
     val sourceIdOpt = options.get(DeltaOptions.STREAMING_SOURCE_TRACKING_ID)
-    val schemaTrackingLocation = DeltaSourceSchemaLog.fullSchemaTrackingLocation(
+    val schemaTrackingLocation = DeltaSourceSchemaTrackingLog.fullSchemaTrackingLocation(
       rootSchemaTrackingLocation, log.tableId, sourceIdOpt)
     // Make sure schema location is under checkpoint
     if (!allowSchemaLocationOutsideOfCheckpoint &&

core/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala

+34-12
@@ -2552,16 +2552,18 @@ trait DeltaErrorsBase
     )
   }
 
-  def blockStreamingReadsOnColumnMappingEnabledTable(
+  def blockStreamingReadsWithIncompatibleColumnMappingSchemaChanges(
+      spark: SparkSession,
       readSchema: StructType,
       incompatibleSchema: StructType,
-      isCdfRead: Boolean,
       detectedDuringStreaming: Boolean): Throwable = {
-    new DeltaColumnMappingUnsupportedSchemaIncompatibleException(
-      if (isCdfRead) "Streaming read of Change Data Feed (CDF)" else "Streaming read",
+    val enableNonAdditiveSchemaEvolution = spark.sessionState.conf.getConf(
+      DeltaSQLConf.DELTA_STREAMING_ENABLE_NON_ADDITIVE_SCHEMA_EVOLUTION)
+    new DeltaStreamingColumnMappingSchemaIncompatibleException(
       readSchema,
       incompatibleSchema,
-      DeltaSQLConf.DELTA_STREAMING_UNSAFE_READ_ON_INCOMPATIBLE_COLUMN_MAPPING_SCHEMA_CHANGES.key,
+      "",
+      enableNonAdditiveSchemaEvolution,
       additionalProperties = Map(
         "detectedDuringStreaming" -> detectedDuringStreaming.toString
       ))
@@ -2635,6 +2637,24 @@ trait DeltaErrorsBase
       cause = cause)
   }
 
+  def streamingSchemaEvolutionException(newSchema: StructType): Throwable = {
+    new DeltaRuntimeException(
+      errorClass = "DELTA_STREAMING_SCHEMA_EVOLUTION",
+      messageParameters = Array(formatSchema(newSchema)))
+  }
+
+  def streamingSchemaLogInitFailedIncompatibleSchemaException(
+      startVersion: Long,
+      endVersion: Long): Throwable = {
+    new DeltaRuntimeException(
+      errorClass = "DELTA_STREAMING_SCHEMA_LOG_INIT_FAILED_INCOMPATIBLE_SCHEMA",
+      messageParameters = Array(
+        startVersion.toString, endVersion.toString,
+        DeltaSQLConf.
+          DELTA_STREAMING_UNSAFE_READ_ON_INCOMPATIBLE_SCHEMA_CHANGES_DURING_STREAM_START.key)
+    )
+  }
+
   def failToDeserializeSchemaLog(location: String): Throwable = {
     new DeltaRuntimeException(
       errorClass = "DELTA_STREAMING_SCHEMA_LOG_DESERIALIZE_FAILED",
@@ -3076,18 +3096,20 @@ class DeltaChecksumException(
  * To make compatible with existing behavior for those who accidentally has already used this
  * operation, user should always be able to use `escapeConfigName` to fall back at own risk.
  */
-class DeltaColumnMappingUnsupportedSchemaIncompatibleException(
-    val opName: String,
+class DeltaStreamingColumnMappingSchemaIncompatibleException(
     val readSchema: StructType,
     val incompatibleSchema: StructType,
-    val escapeConfigName: String,
+    val docLink: String,
+    val enableNonAdditiveSchemaEvolution: Boolean = false,
     val additionalProperties: Map[String, String] = Map.empty)
   extends DeltaUnsupportedOperationException(
-    errorClass = "DELTA_BLOCK_COLUMN_MAPPING_SCHEMA_INCOMPATIBLE_OPERATION",
+    errorClass = if (enableNonAdditiveSchemaEvolution) {
+      "DELTA_STREAMING_INCOMPATIBLE_SCHEMA_CHANGE_USE_SCHEMA_LOG"
+    } else {
+      "DELTA_STREAMING_INCOMPATIBLE_SCHEMA_CHANGE"
+    },
     messageParameters = Array(
-      opName,
       readSchema.json,
       incompatibleSchema.json,
-      opName,
-      escapeConfigName)
+      docLink)
   )

core/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala

+9-11
@@ -457,18 +457,19 @@ class DeltaLog private(
    * Returns a [[org.apache.spark.sql.DataFrame]] containing the new files within the specified
    * version range.
    *
-   * It can optionally take a customReadSchema which consists of the actual read schema to read
-   * the files. This is used to support non-additive Delta Source streaming schema evolution.
-   * The customReadSchema requires that its partitionSchema for the Delta table does not change from
-   * the snapshot's partitionSchema.
+   * @param customDataSchema Optional data schema that will be used to read the files.
+   *                         This is used when reading multiple snapshots using one all-encompassing
+   *                         schema, e.g. during streaming.
+   *                         This parameter only modifies the data schema. The partition schema is
+   *                         not updated, so the caller should ensure that it does not change
+   *                         compared to the snapshot.
    */
   def createDataFrame(
       snapshot: Snapshot,
       addFiles: Seq[AddFile],
       isStreaming: Boolean = false,
       actionTypeOpt: Option[String] = None,
-      customReadSchema: Option[PersistedSchema] = None
-  ): DataFrame = {
+      customDataSchema: Option[StructType] = None): DataFrame = {
     val actionType = actionTypeOpt.getOrElse(if (isStreaming) "streaming" else "batch")
     // It's ok to not pass down the partitionSchema to TahoeBatchFileIndex. Schema evolution will
     // ensure any partitionSchema changes will be captured, and upon restart, the new snapshot will
@@ -479,13 +480,10 @@ class DeltaLog private(
     val partitionSchema = snapshot.metadata.partitionSchema
     var metadata = snapshot.metadata
 
-    require(customReadSchema.forall(_.partitionSchema == partitionSchema),
-      "Cannot specify a custom read schema with different partition schema than the Delta table")
-
     // Replace schema inside snapshot metadata so that later `fileFormat()` can generate the correct
-    // DeltaParquetFormat with the correct schema to references, the customReadSchema should also
+    // DeltaParquetFormat with the correct schema to references, the customDataSchema should also
     // contain the correct column mapping metadata if needed after being loaded from schema log.
-    customReadSchema.map(_.dataSchema).foreach { readSchema =>
+    customDataSchema.foreach { readSchema =>
       metadata = snapshot.metadata.copy(schemaString = readSchema.json)
     }
 
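To make the new parameter concrete, a hypothetical call site could look like the sketch below. Only the createDataFrame signature comes from this diff; deltaLog, snapshot, addFiles, and persistedSchema are assumed to be in scope, with persistedSchema being an entry loaded from the schema tracking log:

import org.apache.spark.sql.types.StructType

// Read a batch of files with the schema persisted in the tracking log rather
// than the snapshot's own schema.
val trackedDataSchema: StructType = persistedSchema.dataSchema

val batchDf = deltaLog.createDataFrame(
  snapshot,
  addFiles,
  isStreaming = true,
  // Only the data schema is overridden; the partition schema must still match
  // the snapshot, as the updated scaladoc warns.
  customDataSchema = Some(trackedDataSchema))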
core/src/main/scala/org/apache/spark/sql/delta/DeltaOptions.scala

+5
@@ -271,6 +271,10 @@ object DeltaOptions extends DeltaLogging {
    * An option to allow column mapping enabled tables to conduct schema evolution during streaming
    */
   val SCHEMA_TRACKING_LOCATION = "schemaTrackingLocation"
+  /**
+   * Alias for `schemaTrackingLocation`, so users familiar with AutoLoader can migrate easily.
+   */
+  val SCHEMA_TRACKING_LOCATION_ALIAS = "schemaLocation"
   /**
    * An option to instruct DeltaSource to pick a customized subdirectory for schema log in case of
    * rare conflicts such as when a stream needs to do a self-union of two Delta sources from the
@@ -307,6 +311,7 @@ object DeltaOptions extends DeltaLogging {
     TXN_APP_ID,
     TXN_VERSION,
     SCHEMA_TRACKING_LOCATION,
+    SCHEMA_TRACKING_LOCATION_ALIAS,
     STREAMING_SOURCE_TRACKING_ID,
     "queryName",
     "checkpointLocation",

core/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala

+3
@@ -1079,6 +1079,9 @@ object SingleAction extends Logging {
 
   lazy val nullLitForAddCDCFile: Column =
     new Column(Literal(null, ScalaReflection.schemaFor[AddCDCFile].dataType))
+
+  lazy val nullLitForMetadataAction: Column =
+    new Column(Literal(null, ScalaReflection.schemaFor[Metadata].dataType))
 }
 
 /** Serializes Maps containing JSON strings without extra escaping. */
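Presumably this mirrors nullLitForAddCDCFile just above: when the streaming source scans the log for both file actions and metadata (schema) actions in a single DataFrame, rows of other action types need a typed null Metadata column. That reading is an inference from the surrounding code, not something stated in the commit.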

core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaDataSource.scala

+24-11
@@ -90,9 +90,8 @@ class DeltaDataSource
 
     val (_, snapshot) = DeltaLog.forTableWithSnapshot(sqlContext.sparkSession, path)
     val readSchema = {
-      getSchemaLogForDeltaSource(sqlContext.sparkSession, snapshot, parameters)
-        // Use `getSchemaAtLogInit` so it's always consistent between analysis and execution phase
-        .flatMap(_.getSchemaAtLogInit.map(_.dataSchema))
+      getSchemaTrackingLogForDeltaSource(sqlContext.sparkSession, snapshot, parameters)
+        .flatMap(_.getCurrentTrackedSchema.map(_.dataSchema))
         .getOrElse(snapshot.schema)
     }
 
@@ -122,10 +121,12 @@ class DeltaDataSource
       })
     val options = new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf)
     val (deltaLog, snapshot) = DeltaLog.forTableWithSnapshot(sqlContext.sparkSession, path)
-    val schemaLogOpt =
-      getSchemaLogForDeltaSource(sqlContext.sparkSession, snapshot, parameters)
-    val readSchema = schemaLogOpt
-      .flatMap(_.getSchemaAtLogInit.map(_.dataSchema))
+
+    val schemaTrackingLogOpt =
+      getSchemaTrackingLogForDeltaSource(sqlContext.sparkSession, snapshot, parameters)
+
+    val readSchema = schemaTrackingLogOpt
+      .flatMap(_.getCurrentTrackedSchema.map(_.dataSchema))
       .getOrElse(snapshot.schema)
 
     if (readSchema.isEmpty) {
@@ -136,7 +137,7 @@ class DeltaDataSource
       deltaLog,
       options,
       snapshot,
-      schemaLog = schemaLogOpt
+      schemaTrackingLog = schemaTrackingLogOpt
     )
   }
 
@@ -237,14 +238,26 @@ class DeltaDataSource
   /**
    * Create a schema log for Delta streaming source if possible
    */
-  private def getSchemaLogForDeltaSource(
+  private def getSchemaTrackingLogForDeltaSource(
       spark: SparkSession,
       sourceSnapshot: Snapshot,
-      parameters: Map[String, String]): Option[DeltaSourceSchemaLog] = {
+      parameters: Map[String, String]): Option[DeltaSourceSchemaTrackingLog] = {
     val options = new CaseInsensitiveStringMap(parameters.asJava)
     Option(options.get(DeltaOptions.SCHEMA_TRACKING_LOCATION))
+      .orElse(Option(options.get(DeltaOptions.SCHEMA_TRACKING_LOCATION_ALIAS)))
       .map { schemaTrackingLocation =>
-        DeltaSourceSchemaLog.create(
+        if (!spark.sessionState.conf.getConf(
+            DeltaSQLConf.DELTA_STREAMING_ENABLE_NON_ADDITIVE_SCHEMA_EVOLUTION)) {
+          // TODO: remove once non-additive schema evolution is released
+          throw new UnsupportedOperationException(
+            "Schema tracking location is not supported for Delta streaming source")
+        }
+        if (Option(options.get(DeltaOptions.CDC_READ_OPTION)).exists(_.toBoolean)) {
+          // TODO: remove once we support CDC streaming with schema log
+          throw new UnsupportedOperationException(
+            "Reading change data feed and streaming is not supported with schema tracking log")
+        }
+        DeltaSourceSchemaTrackingLog.create(
           spark, schemaTrackingLocation, sourceSnapshot,
           Option(options.get(DeltaOptions.STREAMING_SOURCE_TRACKING_ID)))
       }
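The two guards above doubly gate the feature while it is unreleased: the internal SQL conf must be enabled, and the source must not be a CDC read. A hedged sketch of what that means for a session (the full conf key assumes the standard "spark.databricks.delta." prefix that DeltaSQLConf prepends to its keys):

// Enable the preview gate first; without it, passing schemaTrackingLocation
// throws UnsupportedOperationException.
spark.conf.set("spark.databricks.delta.streaming.nonAdditiveSchemaEvolution.enabled", "true")

// Even with the gate on, combining schema tracking with a CDC read is rejected:
// spark.readStream.format("delta")
//   .option("readChangeFeed", "true") // DeltaOptions.CDC_READ_OPTION
//   .option("schemaTrackingLocation", "/checkpoints/events/_schema_log")
//   .load("/tables/events")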

core/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

+11-1
@@ -901,7 +901,7 @@ trait DeltaSQLConfBase
       .booleanConf
       .createWithDefault(false)
 
-  val DELTA_STREAMING_UNSAFE_READ_ON_INCOMPATIBLE_SCHEMA_CHANGES_DURING_STREAM_SATRT =
+  val DELTA_STREAMING_UNSAFE_READ_ON_INCOMPATIBLE_SCHEMA_CHANGES_DURING_STREAM_START =
     buildConf("streaming.unsafeReadOnIncompatibleSchemaChangesDuringStreamStart.enabled")
       .doc(
         """A legacy config to disable schema read-compatibility check on the start version schema
@@ -912,6 +912,16 @@ trait DeltaSQLConfBase
       .booleanConf
       .createWithDefault(false)
 
+  val DELTA_STREAMING_ENABLE_NON_ADDITIVE_SCHEMA_EVOLUTION =
+    buildConf("streaming.nonAdditiveSchemaEvolution.enabled")
+      .doc(
+        """If enabled, Delta streaming source can support non-additive schema evolution for
+          |operations such as rename or drop column on column mapping enabled tables.
+          |""".stripMargin)
+      .internal()
+      .booleanConf
+      .createWithDefault(false)
+
   val DELTA_STREAMING_ALLOW_SCHEMA_LOCATION_OUTSIDE_CHECKPOINT_LOCATION =
     buildConf("streaming.allowSchemaLocationOutsideCheckpointLocation")
       .doc(
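For completeness, the renamed STREAM_START flag (fixing the SATRT typo) is the escape hatch surfaced by the new DELTA_STREAMING_SCHEMA_LOG_INIT_FAILED_INCOMPATIBLE_SCHEMA message. A hypothetical opt-in, again assuming the "spark.databricks.delta." key prefix:

spark.conf.set(
  "spark.databricks.delta.streaming.unsafeReadOnIncompatibleSchemaChangesDuringStreamStart.enabled",
  "true")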
