Skip to content

Commit fc39f78

Browse files
xupefei
authored and vkorukanti committed
Improve DV path canonicalization
## Description This PR improves the FILE_PATH canonicalization logic by avoiding calling expensive `Path.toUri.toString` calls for each row in a table. Canonicalized paths are now cached and the UDF just needs to look it up. Future improvement is possible for handling huge logs: build `canonicalizedPathMap` in a distributed way. Related PR target the 2.4 branch: #1829. Existing tests. Closes #1836 Signed-off-by: Paddy Xu <xupaddy@gmail.com> GitOrigin-RevId: c4810852f9136c36ec21f3519620ca26ed12bb04
1 parent 6583b2e commit fc39f78

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

spark/src/main/scala/org/apache/spark/sql/delta/commands/DeleteWithDeletionVectorsHelper.scala

+21-5
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import org.apache.spark.sql.delta.util.DeltaFileOperations.absolutePath
3535
import org.apache.hadoop.conf.Configuration
3636
import org.apache.hadoop.fs.Path
3737

38+
import org.apache.spark.paths.SparkPath
3839
import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder, SparkSession}
3940
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, FileSourceMetadataAttribute}
4041
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project}
@@ -341,11 +342,16 @@ object DeletionVectorBitmapGenerator {
341342
: Seq[DeletionVectorResult] = {
342343
// TODO: fix this to work regardless of whether Spark encodes or doesn't encode
343344
// _metadata.file_path. See https://github.com/delta-io/delta/issues/1725
344-
val uriEncode = DeltaUDF.stringFromString(path => {
345-
new Path(path).toUri.toString
346-
})
345+
// Build two maps, using Path or String as keys. The one with String keys is used in UDF.
346+
val canonicalizedPathMap = buildCanonicalizedPathMap(txn.deltaLog, candidateFiles)
347+
val canonicalizedPathStringMap =
348+
canonicalizedPathMap.map { case (k, v) => k.toString -> v }
349+
val broadcastCanonicalizedPathStringMap =
350+
sparkSession.sparkContext.broadcast(canonicalizedPathStringMap)
351+
352+
val lookupPathUdf = DeltaUDF.stringFromString(broadcastCanonicalizedPathStringMap.value(_))
347353
val matchedRowsDf = targetDf
348-
.withColumn(FILE_NAME_COL, uriEncode(col(s"${METADATA_NAME}.${FILE_PATH}")))
354+
.withColumn(FILE_NAME_COL, lookupPathUdf(col(s"${METADATA_NAME}.${FILE_PATH}")))
349355
// Filter after getting input file name as the filter might introduce a join and we
350356
// cannot get input file name on join's output.
351357
.filter(new Column(condition))
@@ -358,7 +364,7 @@ object DeletionVectorBitmapGenerator {
358364
val filePathToDV = candidateFiles.map { add =>
359365
val serializedDV = Option(add.deletionVector).map(dvd => JsonUtils.toJson(dvd))
360366
// Paths in the metadata column are canonicalized. Thus we must canonicalize the DV path.
361-
FileToDvDescriptor(absolutePath(basePath, add.path).toUri.toString, serializedDV)
367+
FileToDvDescriptor(canonicalizedPathMap(absolutePath(basePath, add.path)), serializedDV)
362368
}
363369
val filePathToDVDf = sparkSession.createDataset(filePathToDV)
364370

@@ -379,6 +385,16 @@ object DeletionVectorBitmapGenerator {
379385

380386
DeletionVectorBitmapGenerator.buildDeletionVectors(sparkSession, df, txn.deltaLog, txn)
381387
}
388+
389+
private def buildCanonicalizedPathMap(
390+
log: DeltaLog,
391+
addFiles: Seq[AddFile]): Map[Path, String] = {
392+
val basePath = log.dataPath.toString
393+
addFiles.map { add =>
394+
val absPath = absolutePath(basePath, add.path)
395+
absPath -> SparkPath.fromPath(absPath).urlEncoded
396+
}.toMap
397+
}
382398
}
383399

384400
/**

0 commit comments

Comments
 (0)