Commit 75c6acb

[Spark] Support show tblproperties and update catalog for clustered table (#3271)
#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

Support SHOW TBLPROPERTIES for clustered tables, and update the clustering-columns property in the catalog when the clustering columns change. Also remove the clustering-columns property from DESCRIBE DETAIL's output, since that command reads the properties stored in the table's metadata (where the property is now filtered out).

## How was this patch tested?

Added verification of the table properties, the DESCRIBE DETAIL output, and the catalog table in verifyClusteringColumns.

## Does this PR introduce _any_ user-facing changes?

No
1 parent 96ae1c5 commit 75c6acb
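For illustration, a minimal sketch of the user-facing behavior this commit enables, assuming a local SparkSession configured with the Delta extensions; the table and column names are made up, and the "clusteringColumns" key mirrors ClusteredTableUtils.PROP_CLUSTERING_COLUMNS:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder()
  .master("local[*]")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate()

spark.sql("CREATE TABLE events (id INT, ts TIMESTAMP) USING delta CLUSTER BY (id)")

// SHOW TBLPROPERTIES now surfaces the clustering columns, read from the Delta
// log rather than from a possibly stale catalog entry.
spark.sql("SHOW TBLPROPERTIES events")
  .filter(col("key") === "clusteringColumns")
  .show(truncate = false)

// DESCRIBE DETAIL, by contrast, no longer lists the clusteringColumns key in
// its properties column, because the property is filtered out of the Metadata action.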

9 files changed (+174 additions, -17 deletions)

spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala

Lines changed: 3 additions & 1 deletion

@@ -449,7 +449,9 @@ class DeltaCatalog extends DelegatingCatalogExtension
     var validatedConfigurations =
       DeltaConfigs.validateConfigurations(tableDesc.properties)
     ClusteredTableUtils.validateExistingTableFeatureProperties(validatedConfigurations)
-    // Add needed configs for Clustered table.
+    // Add needed configs for Clustered table. Note that [[PROP_CLUSTERING_COLUMNS]] can only
+    // be added after [[DeltaConfigs.validateConfigurations]], to avoid failing the
+    // non-user-configurable check.
     if (maybeClusterBySpec.nonEmpty) {
       validatedConfigurations =
         validatedConfigurations ++
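The new comment's ordering constraint is the interesting part: the validator rejects properties that end users may not set directly, so the clustering property has to be appended after validation runs. A schematic sketch of that constraint, with simplified names standing in for the real DeltaConfigs implementation:

// Schematic only: illustrates the ordering described in the comment above.
object ConfigValidationSketch {
  private val nonUserConfigurable = Set("clusteringColumns")

  def validateConfigurations(props: Map[String, String]): Map[String, String] = {
    val offending = props.keySet.intersect(nonUserConfigurable)
    require(offending.isEmpty, s"non-user-configurable properties set: $offending")
    props
  }

  def main(args: Array[String]): Unit = {
    // Validate the user-supplied properties first...
    val validated = validateConfigurations(Map("delta.appendOnly" -> "false"))
    // ...then append the clustering property, which would have failed validation.
    val withClustering = validated + ("clusteringColumns" -> """[["id"]]""")
    println(withClustering)
  }
}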

spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaTableV2.scala

Lines changed: 20 additions & 0 deletions

@@ -22,6 +22,8 @@ import java.{util => ju}
 import scala.collection.JavaConverters._
 import scala.collection.mutable
 
+import org.apache.spark.sql.delta.skipping.clustering.{ClusteredTableUtils, ClusteringColumnInfo}
+import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec
 import org.apache.spark.sql.delta._
 import org.apache.spark.sql.delta.commands.WriteIntoDelta
 import org.apache.spark.sql.delta.commands.cdc.CDCReader

@@ -193,6 +195,13 @@ case class DeltaTableV2(
         base.put(TableCatalog.PROP_EXTERNAL, "true")
       }
     }
+    // Don't use [[PROP_CLUSTERING_COLUMNS]] from the CatalogTable, because it may be stale:
+    // ALTER TABLE updates it via an async post-commit hook.
+    clusterBySpec.foreach { clusterBy =>
+      ClusterBySpec.toProperties(clusterBy).foreach { case (key, value) =>
+        base.put(key, value)
+      }
+    }
     Option(initialSnapshot.metadata.description).foreach(base.put(TableCatalog.PROP_COMMENT, _))
     base.asJava
   }

@@ -322,6 +331,17 @@ case class DeltaTableV2(
   override def v1Table: CatalogTable = ttSafeCatalogTable.getOrElse {
     throw DeltaErrors.invalidV1TableCall("v1Table", "DeltaTableV2")
   }
+
+  lazy val clusterBySpec: Option[ClusterBySpec] = {
+    // Always get the clustering columns from the metadata domain in the Delta log.
+    if (ClusteredTableUtils.isSupported(initialSnapshot.protocol)) {
+      val clusteringColumns = ClusteringColumnInfo.extractLogicalNames(
+        initialSnapshot)
+      Some(ClusterBySpec.fromColumnNames(clusteringColumns))
+    } else {
+      None
+    }
+  }
 }
 
 object DeltaTableV2 {
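For intuition, a self-contained sketch of the derivation the new clusterBySpec performs: trust only the snapshot from the Delta log, never the catalog entry. The types below are simplified stand-ins, not the real Delta classes:

// Simplified stand-ins; the real code uses Delta's Protocol, Snapshot,
// ClusteringColumnInfo and ClusterBySpec types.
case class Protocol(writerFeatures: Set[String])
case class Snapshot(protocol: Protocol, logicalClusteringColumns: Seq[String])
case class ClusterBySpec(columnNames: Seq[String])

// Mirrors the shape of DeltaTableV2.clusterBySpec: consult the snapshot's
// metadata domain, and only when the clustering table feature is enabled.
def clusterBySpec(snapshot: Snapshot): Option[ClusterBySpec] =
  if (snapshot.protocol.writerFeatures.contains("clustering")) {
    Some(ClusterBySpec(snapshot.logicalClusteringColumns))
  } else {
    None
  }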

spark/src/main/scala/org/apache/spark/sql/delta/hooks/UpdateCatalog.scala

Lines changed: 35 additions & 1 deletion

@@ -24,6 +24,8 @@ import scala.concurrent.{ExecutionContext, Future, TimeoutException}
 import scala.util.Try
 import scala.util.control.NonFatal
 
+import org.apache.spark.sql.delta.skipping.clustering.{ClusteredTableUtils, ClusteringColumnInfo}
+import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec
 import org.apache.spark.sql.delta.{DeltaConfigs, DeltaTableIdentifier, OptimisticTransactionImpl, Snapshot}
 import org.apache.spark.sql.delta.actions.{Action, Metadata}
 import org.apache.spark.sql.delta.metering.DeltaLogging

@@ -123,6 +125,20 @@ trait UpdateCatalogBase extends PostCommitHook with DeltaLogging {
     return DeltaTableIdentifier.isDeltaPath(spark, table.identifier)
   }
 
+  /** Returns true if the clustering columns in the snapshot don't match the table properties. */
+  protected def clusteringColumnsChanged(snapshot: Snapshot): Boolean = {
+    if (!ClusteredTableUtils.isSupported(snapshot.protocol)) {
+      return false
+    }
+    val currentLogicalClusteringNames =
+      ClusteringColumnInfo.extractLogicalNames(snapshot).mkString(",")
+    val clusterBySpecOpt = ClusterBySpec.fromProperties(table.properties)
+
+    // Since we never remove the clustering columns table property, this case can't happen.
+    assert(!(currentLogicalClusteringNames.nonEmpty && clusterBySpecOpt.isEmpty))
+    clusterBySpecOpt.exists(_.columnNames.map(_.toString).mkString(",") !=
+      currentLogicalClusteringNames)
+  }
 
   /** Update the entry in the Catalog to reflect the latest schema and table properties. */
   protected def execute(

@@ -161,6 +177,15 @@ trait UpdateCatalogBase extends PostCommitHook with DeltaLogging {
         "delta.catalog.update.properties",
         data = loggingData
       )
+    } else if (clusteringColumnsChanged(snapshot)) {
+      // If the clustering columns changed, update the catalog with the new
+      // table properties.
+      updateProperties(spark, snapshot)
+      recordDeltaEvent(
+        snapshot.deltaLog,
+        "delta.catalog.update.clusteringColumns",
+        data = loggingData
+      )
     }
   } catch {
     case NonFatal(e) =>

@@ -259,7 +284,8 @@ case class UpdateCatalog(table: CatalogTable) extends UpdateCatalogBase {
 }
 
 object UpdateCatalog {
-  private var tp: ExecutionContext = _
+  // Exposed for testing.
+  private[delta] var tp: ExecutionContext = _
 
   // This is the encoding of the database for the Hive MetaStore
   private val latin1 = Charset.forName("ISO-8859-1")

@@ -344,6 +370,14 @@ object UpdateCatalog {
       snapshot.getProperties.toMap ++ Map(
         DeltaConfigs.METASTORE_LAST_UPDATE_VERSION -> snapshot.version.toString,
         DeltaConfigs.METASTORE_LAST_COMMIT_TIMESTAMP -> snapshot.timestamp.toString)
+    if (ClusteredTableUtils.isSupported(snapshot.protocol)) {
+      val clusteringColumns = ClusteringColumnInfo.extractLogicalNames(snapshot)
+      val properties = ClusterBySpec.toProperties(
+        ClusterBySpec.fromColumnNames(clusteringColumns))
+      properties.foreach { case (key, value) =>
+        newProperties += (key -> value)
+      }
+    }
     newProperties
   }
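To make the staleness check concrete, here is a small sketch of the comparison clusteringColumnsChanged performs, with a plain comma-separated encoding standing in for the real JSON-valued property:

// Illustrative only; the real property value is JSON, not comma-separated.
def clusteringColumnsChanged(
    snapshotColumns: Seq[String],
    tableProperties: Map[String, String]): Boolean = {
  val current = snapshotColumns.mkString(",")
  // "clusteringColumns" mirrors ClusteredTableUtils.PROP_CLUSTERING_COLUMNS.
  val stored = tableProperties.get("clusteringColumns")
  // The property is never removed once set, so a clustered snapshot without a
  // stored property would indicate a bug (mirrors the assert in the hook).
  assert(!(current.nonEmpty && stored.isEmpty))
  stored.exists(_ != current)
}

// Reordering the clustering keys counts as a change and triggers the catalog update:
assert(clusteringColumnsChanged(Seq("name", "id"), Map("clusteringColumns" -> "id,name")))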

spark/src/main/scala/org/apache/spark/sql/delta/schema/ImplicitMetadataOperation.scala

Lines changed: 3 additions & 1 deletion

@@ -107,7 +107,9 @@ trait ImplicitMetadataOperation extends DeltaLogging {
       throw DeltaErrors.unexpectedDataChangeException("Create a Delta table")
     }
     val description = configuration.get("comment").orNull
-    val cleanedConfs = configuration.filterKeys(_ != "comment").toMap
+    // Filter the clustering columns property out of the Metadata action.
+    val cleanedConfs = ClusteredTableUtils.removeClusteringColumnsProperty(
+      configuration.filterKeys(_ != "comment").toMap)
     txn.updateMetadata(
       Metadata(
         description = description,
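The helper's behavior is presumably a plain key removal, keeping the clustering columns out of the Metadata action so they live only in the clustering metadata domain. A one-line sketch of that assumption (inferred from the call site, not copied from the real source):

// Assumed shape of ClusteredTableUtils.removeClusteringColumnsProperty.
def removeClusteringColumnsProperty(configuration: Map[String, String]): Map[String, String] =
  configuration - "clusteringColumns" // i.e. ClusteredTableUtils.PROP_CLUSTERING_COLUMNS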

spark/src/main/scala/org/apache/spark/sql/delta/skipping/clustering/temp/ClusterBySpec.scala

Lines changed: 11 additions & 0 deletions

@@ -67,6 +67,17 @@ object ClusterBySpec {
     ClusteredTableUtils.PROP_CLUSTERING_COLUMNS -> clusterBySpec.toJson
   }
 
+  def fromProperties(properties: Map[String, String]): Option[ClusterBySpec] = {
+    properties.get(ClusteredTableUtils.PROP_CLUSTERING_COLUMNS).map { clusteringColumns =>
+      fromProperty(clusteringColumns)
+    }
+  }
+
+  def toProperties(clusterBySpec: ClusterBySpec): Map[String, String] = {
+    val columnValue = mapper.writeValueAsString(clusterBySpec.columnNames.map(_.fieldNames))
+    Map(ClusteredTableUtils.PROP_CLUSTERING_COLUMNS -> columnValue)
+  }
+
   def fromColumnNames(names: Seq[String]): ClusterBySpec = {
     ClusterBySpec(names.map(FieldReference(_)))
   }
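As a sanity check on the two new helpers, a hedged round-trip sketch; the exact JSON encoding of the field names is inferred from the mapper call above, not verified against the real codebase:

// Round trip through the new helpers (expected values are an assumption).
val spec = ClusterBySpec.fromColumnNames(Seq("id", "name"))
val props = ClusterBySpec.toProperties(spec)
// Expected roughly: Map("clusteringColumns" -> """[["id"],["name"]]""")
val parsed = ClusterBySpec.fromProperties(props)
assert(parsed.exists(_.columnNames.map(_.toString) == Seq("id", "name")))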

spark/src/test/scala/io/delta/tables/DeltaTableBuilderSuite.scala

Lines changed: 12 additions & 8 deletions

@@ -466,15 +466,19 @@ class DeltaTableBuilderSuite
   }
 
   test("create table with clustering") {
-    withTable("test") {
-      io.delta.tables.DeltaTable.create().tableName("test")
-        .addColumn("c1", "int")
-        .clusterBy("c1")
-        .execute()
+    withSQLConf(
+      // Enable update catalog for verifyClusteringColumns.
+      DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED.key -> "true") {
+      withTable("test") {
+        io.delta.tables.DeltaTable.create().tableName("test")
+          .addColumn("c1", "int")
+          .clusterBy("c1")
+          .execute()
 
-      val deltaLog = DeltaLog.forTable(spark, TableIdentifier("test"))
-      val metadata = deltaLog.snapshot.metadata
-      verifyClusteringColumns(TableIdentifier("test"), Seq("c1"))
+        val deltaLog = DeltaLog.forTable(spark, TableIdentifier("test"))
+        val metadata = deltaLog.snapshot.metadata
+        verifyClusteringColumns(TableIdentifier("test"), Seq("c1"))
+      }
     }
   }

spark/src/test/scala/org/apache/spark/sql/delta/skipping/ClusteredTableTestUtils.scala

Lines changed: 41 additions & 1 deletion

@@ -17,10 +17,14 @@
 package org.apache.spark.sql.delta.skipping
 
 import org.apache.spark.sql.delta.skipping.clustering.{ClusteredTableUtils, ClusteringColumn, ClusteringColumnInfo}
+import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec
 import org.apache.spark.sql.delta.{DeltaLog, Snapshot}
 import org.apache.spark.sql.delta.DeltaOperations.{CLUSTERING_PARAMETER_KEY, ZORDER_PARAMETER_KEY}
 import org.apache.spark.sql.delta.commands.optimize.OptimizeMetrics
+import org.apache.spark.sql.delta.hooks.UpdateCatalog
+import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.util.JsonUtils
+import org.junit.Assert.assertEquals
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.DataFrame

@@ -210,14 +214,32 @@ trait ClusteredTableTestUtilsBase extends SparkFunSuite with SharedSparkSession
 
   def verifyClusteringColumns(
       tableIdentifier: TableIdentifier,
-      expectedLogicalClusteringColumns: Seq[String]
+      expectedLogicalClusteringColumns: Seq[String],
+      skipCatalogCheck: Boolean = false
   ): Unit = {
     val (_, snapshot) = DeltaLog.forTableWithSnapshot(spark, tableIdentifier)
     verifyClusteringColumnsInternal(
       snapshot,
       tableIdentifier.table,
      expectedLogicalClusteringColumns
     )
+
+    if (skipCatalogCheck) {
+      return
+    }
+
+    val updateCatalogEnabled = spark.conf.get(DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED)
+    assert(updateCatalogEnabled,
+      "need to enable [[DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED]] to verify catalog updates.")
+    UpdateCatalog.awaitCompletion(10000)
+    val catalog = spark.sessionState.catalog
+    catalog.refreshTable(tableIdentifier)
+    val table = catalog.getTableMetadata(tableIdentifier)
+
+    // Verify CatalogTable's clusterBySpec.
+    assert(ClusteredTableUtils.getClusterBySpecOptional(table).isDefined)
+    assertEquals(ClusterBySpec.fromColumnNames(expectedLogicalClusteringColumns),
+      ClusteredTableUtils.getClusterBySpecOptional(table).get)
   }
 
   def verifyClusteringColumns(

@@ -243,6 +265,24 @@ trait ClusteredTableTestUtilsBase extends SparkFunSuite with SharedSparkSession
     verifyDescribeHistoryOperationParameters(
       tableNameOrPath
     )
+
+    // Verify that DESCRIBE DETAIL's properties don't contain the "clusteringColumns" key.
+    val describeDetailProps = sql(s"describe detail $tableNameOrPath")
+      .select("properties")
+      .first
+      .getAs[Map[String, String]](0)
+    assert(!describeDetailProps.contains(ClusteredTableUtils.PROP_CLUSTERING_COLUMNS))
+
+    // Verify that SHOW TBLPROPERTIES contains the correct clustering columns.
+    val clusteringColumnsVal =
+      sql(s"show tblproperties $tableNameOrPath")
+        .filter($"key" === ClusteredTableUtils.PROP_CLUSTERING_COLUMNS)
+        .select("value")
+        .first
+        .getString(0)
+    val clusterBySpec = ClusterBySpec.fromProperties(
+      Map(ClusteredTableUtils.PROP_CLUSTERING_COLUMNS -> clusteringColumnsVal)).get
+    assert(expectedLogicalClusteringColumns === clusterBySpec.columnNames.map(_.toString))
   }
 }

spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/ClusteredTableDDLSuite.scala

Lines changed: 46 additions & 4 deletions

@@ -22,6 +22,7 @@ import com.databricks.spark.util.{Log4jUsageLogger, MetricDefinitions}
 import org.apache.spark.sql.delta.skipping.ClusteredTableTestUtils
 import org.apache.spark.sql.delta.{DeltaAnalysisException, DeltaColumnMappingEnableIdMode, DeltaColumnMappingEnableNameMode, DeltaConfigs, DeltaExcludedBySparkVersionTestMixinShims, DeltaLog, DeltaUnsupportedOperationException}
 import org.apache.spark.sql.delta.clustering.ClusteringMetadataDomain
+import org.apache.spark.sql.delta.hooks.UpdateCatalog
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.stats.SkippingEligibleDataType
 import org.apache.spark.sql.delta.test.{DeltaColumnMappingSelectedTestMixin, DeltaSQLCommandTest}

@@ -37,6 +38,35 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest
     with SharedSparkSession
     with ClusteredTableTestUtils {
 
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+    spark.conf.set(DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED.key, "true")
+  }
+
+  override def afterAll(): Unit = {
+    // Reset UpdateCatalog's thread pool to ensure it is re-initialized in the next test
+    // suite. This is necessary because the [[SparkThreadLocalForwardingThreadPoolExecutor]]
+    // retains a reference to the SparkContext. Without the reset, the next test suite would
+    // reuse the SparkContext from the previous suite, despite it being stopped.
+    //
+    // Nulling the pool forces UpdateCatalog's background thread to pick up the new SparkContext.
+    //
+    // scalastyle:off line.size.limit
+    // This avoids the following exception being thrown from UpdateCatalog's background thread:
+    // java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
+    // This stopped SparkContext was created at:
+    //
+    // org.apache.spark.sql.delta.skipping.clustering.ClusteredTableDDLDataSourceV2NameColumnMappingSuite.beforeAll
+    //
+    // The currently active SparkContext was created at:
+    //
+    // org.apache.spark.sql.delta.skipping.clustering.ClusteredTableDDLDataSourceV2Suite.beforeAll
+    // scalastyle:on line.size.limit
+    UpdateCatalog.tp = null
+
+    super.afterAll()
+  }
+
   protected val testTable: String = "test_ddl_table"
   protected val sourceTable: String = "test_ddl_source"
   protected val targetTable: String = "test_ddl_target"

@@ -627,6 +657,17 @@ trait ClusteredTableDDLSuiteBase
     }
   }
 
+  test("alter table cluster by - catalog reflects clustering columns when reordered") {
+    withClusteredTable(testTable, "id INT, a STRUCT<b INT, c STRING>, name STRING", "id, name") {
+      val tableIdentifier = TableIdentifier(testTable)
+      verifyClusteringColumns(tableIdentifier, Seq("id", "name"))
+
+      // Reorder the clustering keys and validate that the catalog sees the reordered keys.
+      sql(s"ALTER TABLE $testTable CLUSTER BY (name, id)")
+      verifyClusteringColumns(tableIdentifier, Seq("name", "id"))
+    }
+  }
+
   test("alter table cluster by - error scenarios") {
     withClusteredTable(testTable, "id INT, id2 INT, name STRING", "id, name") {
       // Specify non-existing columns.

@@ -862,7 +903,7 @@
 
       sql(s"RESTORE TABLE $testTable TO VERSION AS OF 0")
       val (_, currentSnapshot) = DeltaLog.forTableWithSnapshot(spark, tableIdentifier)
-      verifyClusteringColumns(tableIdentifier, Seq.empty)
+      verifyClusteringColumns(tableIdentifier, Seq.empty, skipCatalogCheck = true)
     }
 
     // Scenario 2: restore clustered table to previous clustering columns.

@@ -873,7 +914,7 @@
       verifyClusteringColumns(tableIdentifier, Seq("b"))
 
       sql(s"RESTORE TABLE $testTable TO VERSION AS OF 0")
-      verifyClusteringColumns(tableIdentifier, Seq("a"))
+      verifyClusteringColumns(tableIdentifier, Seq("a"), skipCatalogCheck = true)
     }
 
     // Scenario 3: restore from table with clustering columns to non-empty clustering columns

@@ -884,7 +925,7 @@
       verifyClusteringColumns(tableIdentifier, Seq.empty)
 
       sql(s"RESTORE TABLE $testTable TO VERSION AS OF 0")
-      verifyClusteringColumns(tableIdentifier, Seq("a"))
+      verifyClusteringColumns(tableIdentifier, Seq("a"), skipCatalogCheck = true)
     }
 
     // Scenario 4: restore to start version.

@@ -894,7 +935,7 @@
      sql(s"INSERT INTO $testTable VALUES (1)")
 
      sql(s"RESTORE TABLE $testTable TO VERSION AS OF 0")
-     verifyClusteringColumns(tableIdentifier, Seq("a"))
+     verifyClusteringColumns(tableIdentifier, Seq("a"), skipCatalogCheck = true)
     }
 
     // Scenario 5: restore unclustered table to unclustered table.

@@ -933,6 +974,7 @@
 }
 
 trait ClusteredTableDDLSuite extends ClusteredTableDDLSuiteBase
+
 trait ClusteredTableDDLWithNameColumnMapping
   extends ClusteredTableCreateOrReplaceDDLSuite with DeltaColumnMappingEnableNameMode

spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/IncrementalZCubeClusteringSuite.scala

Lines changed: 3 additions & 1 deletion

@@ -191,7 +191,9 @@ class IncrementalZCubeClusteringSuite extends QueryTest
 
   test("test changing clustering columns") {
     withSQLConf(
-      SQLConf.MAX_RECORDS_PER_FILE.key -> "2") {
+      SQLConf.MAX_RECORDS_PER_FILE.key -> "2",
+      // Enable update catalog for verifyClusteringColumns.
+      DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED.key -> "true") {
       withClusteredTable(
         table = table,
         schema = "col1 int, col2 int",
