@@ -27,6 +27,7 @@ import org.apache.spark.sql.delta.constraints.{Constraint, Constraints, DeltaInv
 import org.apache.spark.sql.delta.metering.DeltaLogging
 import org.apache.spark.sql.delta.schema._
 import org.apache.spark.sql.delta.sources.DeltaSQLConf
+import org.apache.spark.sql.delta.sources.DeltaSQLConf.DELTA_COLLECT_STATS_USING_TABLE_SCHEMA
 import org.apache.spark.sql.delta.stats.{DeltaJobStatisticsTracker, StatisticsCollection}
 import org.apache.commons.lang3.exception.ExceptionUtils
 import org.apache.hadoop.fs.Path
@@ -241,24 +242,25 @@ trait TransactionalWrite extends DeltaLogging { self: OptimisticTransactionImpl
   }
 
   /**
-   * Return a tuple of (statsDataSchema, statsCollectionSchema).
-   * statsDataSchema is the data source schema from DataFrame used for stats collection. It
-   * contains the columns in the DataFrame output, excluding the partition columns.
-   * statsCollectionSchema is the schema to collect stats for. It contains the columns in the
+   * Return a tuple of (outputStatsCollectionSchema, tableStatsCollectionSchema).
+   * outputStatsCollectionSchema is the data source schema from DataFrame used for stats collection.
+   * It contains the columns in the DataFrame output, excluding the partition columns.
+   * tableStatsCollectionSchema is the schema to collect stats for. It contains the columns in the
    * table schema, excluding the partition columns.
    * Note: We only collect NULL_COUNT stats (as the number of rows) for the columns in
-   * statsCollectionSchema but missing in statsDataSchema
+   * tableStatsCollectionSchema but missing in outputStatsCollectionSchema
    */
   protected def getStatsSchema(
       dataFrameOutput: Seq[Attribute],
       partitionSchema: StructType): (Seq[Attribute], Seq[Attribute]) = {
     val partitionColNames = partitionSchema.map(_.name).toSet
 
-    // statsDataSchema comes from DataFrame output
+    // The outputStatsCollectionSchema comes from DataFrame output
     // schema should be normalized, therefore we can do an equality check
-    val statsDataSchema = dataFrameOutput.filterNot(c => partitionColNames.contains(c.name))
+    val outputStatsCollectionSchema = dataFrameOutput
+      .filterNot(c => partitionColNames.contains(c.name))
 
-    // statsCollectionSchema comes from table schema
+    // The tableStatsCollectionSchema comes from table schema
     val statsTableSchema = metadata.schema.toAttributes
     val mappedStatsTableSchema = if (metadata.columnMappingMode == NoMapping) {
       statsTableSchema
@@ -267,10 +269,10 @@ trait TransactionalWrite extends DeltaLogging { self: OptimisticTransactionImpl
     }
 
     // It's important to first do the column mapping and then drop the partition columns
-    val filteredStatsTableSchema = mappedStatsTableSchema
+    val tableStatsCollectionSchema = mappedStatsTableSchema
       .filterNot(c => partitionColNames.contains(c.name))
 
-    (statsDataSchema, filteredStatsTableSchema)
+    (outputStatsCollectionSchema, tableStatsCollectionSchema)
   }
 
   protected def getStatsColExpr(
@@ -291,33 +293,33 @@ trait TransactionalWrite extends DeltaLogging { self: OptimisticTransactionImpl
       Option[StatisticsCollection]) = {
     if (spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_COLLECT_STATS)) {
 
-      val (statsDataSchema, statsCollectionSchema) = getStatsSchema(output, partitionSchema)
-
-      val indexedCols = DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(metadata)
+      val (outputStatsCollectionSchema, tableStatsCollectionSchema) =
+        getStatsSchema(output, partitionSchema)
 
       val statsCollection = new StatisticsCollection {
-        override def tableDataSchema = {
-          // If collecting stats using the table schema, then pass in statsCollectionSchema.
-          // Otherwise pass in statsDataSchema to collect stats using the DataFrame schema.
-          if (spark.sessionState.conf.getConf(DeltaSQLConf
-              .DELTA_COLLECT_STATS_USING_TABLE_SCHEMA)) {
-            statsCollectionSchema.toStructType
+        override val columnMappingMode: DeltaColumnMappingMode = metadata.columnMappingMode
+        override def tableSchema: StructType = metadata.schema
+        override def outputTableStatsSchema: StructType = {
+          // If collecting stats uses the table schema, then we pass in tableStatsCollectionSchema;
+          // otherwise, pass in outputStatsCollectionSchema to collect stats using the DataFrame
+          // schema.
+          if (spark.sessionState.conf.getConf(DELTA_COLLECT_STATS_USING_TABLE_SCHEMA)) {
+            tableStatsCollectionSchema.toStructType
           } else {
-            statsDataSchema.toStructType
+            outputStatsCollectionSchema.toStructType
           }
         }
-        override def dataSchema = statsDataSchema.toStructType
+        override def outputAttributeSchema: StructType = outputStatsCollectionSchema.toStructType
         override val spark: SparkSession = data.sparkSession
-        override val numIndexedCols = indexedCols
+        override val statsColumnSpec = StatisticsCollection.configuredDeltaStatsColumnSpec(metadata)
         override val protocol: Protocol = newProtocol.getOrElse(snapshot.protocol)
       }
-
-      val statsColExpr = getStatsColExpr(statsDataSchema, statsCollection)
+      val statsColExpr = getStatsColExpr(outputStatsCollectionSchema, statsCollection)
 
       (Some(new DeltaJobStatisticsTracker(
         deltaLog.newDeltaHadoopConf(),
         outputPath,
-        statsDataSchema,
+        outputStatsCollectionSchema,
         statsColExpr)), Some(statsCollection))
     } else {
       (None, None)