Skip to content

Commit a5d7c69

Browse files
authored
[Kernel][Defaults] Support reading parquet files with legacy 3-level repeated types (#3083)
## Description When legacy mode is enabled in Spark, array physical types are stored slightly different from the standard format. Standard mode (default): ``` optional group readerFeatures (LIST) { repeated group list { optional binary element (STRING); } } ``` When write legacy mode is enabled (`spark.sql.parquet.writeLegacyFormat = true`): ``` optional group readerFeatures (LIST) { repeated group bag { optional binary array (STRING); } } ``` TODO: We need to handle the 2-level lists. Will post a separate PR. The challenge is with generating or finding the Parquet files with 2-level lists. ## How was this patch tested? Added tests Fixes #3082
1 parent c2f23d7 commit a5d7c69

13 files changed

+87
-35
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{"commitInfo":{"timestamp":1715358358979,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"200","numOutputBytes":"20934"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"01d57c6f-6073-484f-b832-5cf368644e4b"}}
2+
{"metaData":{"id":"fb723383-fb90-4346-a846-ccdac9a3204b","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"ByteType\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ShortType\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"IntegerType\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"LongType\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"FloatType\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DoubleType\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BooleanType\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StringType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BinaryType\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DateType\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampType\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampNTZType\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aa\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ac\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aca\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_prims\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_arrays\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_structs\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_prims\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_rows\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_arrays\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1715358356845}}
3+
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}}
4+
{"add":{"path":"part-00000-5afb67f1-094a-4a15-922e-c1eb96683964-c000.snappy.parquet","partitionValues":{},"size":20934,"modificationTime":1715358358904,"dataChange":true,"stats":"{\"numRecords\":200,\"minValues\":{\"ByteType\":-128,\"ShortType\":1,\"IntegerType\":1,\"LongType\":2,\"FloatType\":0.234,\"DoubleType\":234234.23,\"decimal\":123.52,\"StringType\":\"1\",\"DateType\":\"1970-01-01\",\"TimestampType\":\"1970-01-01T06:30:23.523Z\",\"TimestampNTZType\":\"1970-01-03T17:03:54.000\",\"nested_struct\":{\"aa\":\"1\",\"ac\":{\"aca\":1}}},\"maxValues\":{\"ByteType\":127,\"ShortType\":199,\"IntegerType\":199,\"LongType\":200,\"FloatType\":46.566,\"DoubleType\":4.661261177E7,\"decimal\":24580.48,\"StringType\":\"99\",\"DateType\":\"1970-02-16\",\"TimestampType\":\"1970-02-23T22:48:01.077Z\",\"TimestampNTZType\":\"1971-06-24T11:56:06.000\",\"nested_struct\":{\"aa\":\"99\",\"ac\":{\"aca\":199}}},\"nullCount\":{\"ByteType\":3,\"ShortType\":4,\"IntegerType\":9,\"LongType\":8,\"FloatType\":8,\"DoubleType\":4,\"decimal\":3,\"BooleanType\":3,\"StringType\":4,\"BinaryType\":4,\"DateType\":4,\"TimestampType\":4,\"TimestampNTZType\":3,\"nested_struct\":{\"aa\":14,\"ac\":{\"aca\":22}},\"array_of_prims\":200,\"array_of_arrays\":200,\"array_of_structs\":200,\"map_of_prims\":200,\"map_of_rows\":200,\"map_of_arrays\":200}}"}}
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{"commitInfo":{"timestamp":1713368423544,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"200","numOutputBytes":"30312"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"d711414e-08c1-46d0-a5c5-d5faad64d59e"}}
2-
{"metaData":{"id":"7027c1fa-69c4-4867-ace8-6f8c44a022d0","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"ByteType\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ShortType\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"IntegerType\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"LongType\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"FloatType\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DoubleType\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BooleanType\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StringType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BinaryType\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DateType\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampType\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampNTZType\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aa\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ac\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aca\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_prims\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_arrays\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_structs\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_prims\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_rows\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_arrays\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1713368421437}}
1+
{"commitInfo":{"timestamp":1715358308005,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"200","numOutputBytes":"21057"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.2.0-SNAPSHOT","txnId":"c84f1a78-0895-4f01-b00e-f3a984c8afca"}}
2+
{"metaData":{"id":"ab49cd9e-a908-4aad-a15b-9dd117d3e0ab","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"ByteType\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ShortType\",\"type\":\"short\",\"nullable\":true,\"metadata\":{}},{\"name\":\"IntegerType\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"LongType\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"FloatType\",\"type\":\"float\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DoubleType\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"decimal\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BooleanType\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"StringType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"BinaryType\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"DateType\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampType\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"TimestampNTZType\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aa\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ac\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"aca\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_prims\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_arrays\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"array_of_structs\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_prims\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":\"long\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_rows\",\"type\":{\"type\":\"map\",\"keyType\":\"integer\",\"valueType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"ab\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"map_of_arrays\",\"type\":{\"type\":\"map\",\"keyType\":\"long\",\"valueType\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1715358307675}}
33
{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz"],"writerFeatures":["timestampNtz"]}}
4-
{"add":{"path":"part-00000-981a72ea-9b1a-4bf3-8c3c-3d7e56a7bb45-c000.snappy.parquet","partitionValues":{},"size":30312,"modificationTime":1713368423449,"dataChange":true,"stats":"{\"numRecords\":200,\"minValues\":{\"ByteType\":-128,\"ShortType\":1,\"IntegerType\":1,\"LongType\":2,\"FloatType\":0.234,\"DoubleType\":234234.23,\"decimal\":123.52,\"StringType\":\"1\",\"DateType\":\"1970-01-01\",\"TimestampType\":\"1970-01-01T06:30:23.523Z\",\"TimestampNTZType\":\"1970-01-03T17:03:54.000\",\"nested_struct\":{\"aa\":\"1\",\"ac\":{\"aca\":1}}},\"maxValues\":{\"ByteType\":127,\"ShortType\":199,\"IntegerType\":199,\"LongType\":200,\"FloatType\":46.566,\"DoubleType\":4.661261177E7,\"decimal\":24580.48,\"StringType\":\"99\",\"DateType\":\"1970-02-16\",\"TimestampType\":\"1970-02-23T22:48:01.077Z\",\"TimestampNTZType\":\"1971-06-24T11:56:06.000\",\"nested_struct\":{\"aa\":\"99\",\"ac\":{\"aca\":199}}},\"nullCount\":{\"ByteType\":3,\"ShortType\":4,\"IntegerType\":9,\"LongType\":8,\"FloatType\":8,\"DoubleType\":4,\"decimal\":3,\"BooleanType\":3,\"StringType\":4,\"BinaryType\":4,\"DateType\":4,\"TimestampType\":4,\"TimestampNTZType\":3,\"nested_struct\":{\"aa\":14,\"ac\":{\"aca\":22}},\"array_of_prims\":8,\"array_of_arrays\":25,\"array_of_structs\":0,\"map_of_prims\":8,\"map_of_rows\":0,\"map_of_arrays\":7}}"}}
4+
{"add":{"path":"part-00000-bf6680d4-5e83-4fce-8ebb-d2b60d7e69c9-c000.snappy.parquet","partitionValues":{},"size":21057,"modificationTime":1715358307997,"dataChange":true,"stats":"{\"numRecords\":200,\"minValues\":{\"ByteType\":-128,\"ShortType\":1,\"IntegerType\":1,\"LongType\":2,\"FloatType\":0.234,\"DoubleType\":234234.23,\"decimal\":123.52,\"StringType\":\"1\",\"DateType\":\"1970-01-01\",\"TimestampType\":\"1970-01-01T06:30:23.523Z\",\"TimestampNTZType\":\"1970-01-03T17:03:54.000\",\"nested_struct\":{\"aa\":\"1\",\"ac\":{\"aca\":1}}},\"maxValues\":{\"ByteType\":127,\"ShortType\":199,\"IntegerType\":199,\"LongType\":200,\"FloatType\":46.566,\"DoubleType\":4.661261177E7,\"decimal\":24580.48,\"StringType\":\"99\",\"DateType\":\"1970-02-16\",\"TimestampType\":\"1970-02-23T22:48:01.077Z\",\"TimestampNTZType\":\"1971-06-24T11:56:06.000\",\"nested_struct\":{\"aa\":\"99\",\"ac\":{\"aca\":199}}},\"nullCount\":{\"ByteType\":3,\"ShortType\":4,\"IntegerType\":9,\"LongType\":8,\"FloatType\":8,\"DoubleType\":4,\"decimal\":3,\"BooleanType\":3,\"StringType\":4,\"BinaryType\":4,\"DateType\":4,\"TimestampType\":4,\"TimestampNTZType\":3,\"nested_struct\":{\"aa\":14,\"ac\":{\"aca\":22}},\"array_of_prims\":200,\"array_of_arrays\":200,\"array_of_structs\":200,\"map_of_prims\":200,\"map_of_rows\":200,\"map_of_arrays\":200}}"}}

connectors/golden-tables/src/test/scala/io/delta/golden/GoldenTables.scala

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,7 +1038,18 @@ class GoldenTables extends QueryTest with SharedSparkSession {
10381038
writeBasicTimestampTable(tablePath, TimeZone.getTimeZone("PST"))
10391039
}
10401040

1041+
generateGoldenTable("parquet-all-types-legacy-format") { tablePath =>
1042+
withSQLConf(("spark.sql.parquet.writeLegacyFormat", "true")) {
1043+
generateAllTypesTable(tablePath)
1044+
}
1045+
}
1046+
10411047
generateGoldenTable("parquet-all-types") { tablePath =>
1048+
// generating using the standard parquet format
1049+
generateAllTypesTable(tablePath)
1050+
}
1051+
1052+
def generateAllTypesTable(tablePath: String): Unit = {
10421053
val timeZone = java.util.TimeZone.getTimeZone("UTC")
10431054
java.util.TimeZone.setDefault(timeZone)
10441055
import java.sql._
@@ -1113,6 +1124,7 @@ class GoldenTables extends QueryTest with SharedSparkSession {
11131124
if (i % 61 != 0) new java.sql.Date(i * 20000000L) else null,
11141125
if (i % 62 != 0) new Timestamp(i * 23423523L) else null,
11151126
if (i % 69 != 0) LocalDateTime.ofEpochSecond(i * 234234L, 200012, UTC) else null,
1127+
// nested_struct
11161128
if (i % 63 != 0) {
11171129
if (i % 19 == 0) {
11181130
// write a struct with all fields null
@@ -1121,13 +1133,15 @@ class GoldenTables extends QueryTest with SharedSparkSession {
11211133
Row(i.toString, if (i % 23 != 0) Row(i) else null)
11221134
}
11231135
} else null,
1136+
// array_of_prims
11241137
if (i % 25 != 0) {
11251138
if (i % 29 == 0) {
11261139
scala.Array()
11271140
} else {
11281141
scala.Array(i, null, i + 1)
11291142
}
11301143
} else null,
1144+
// array_of_arrays
11311145
if (i % 8 != 0) {
11321146
val singleElemArray = scala.Array(i)
11331147
val doubleElemArray = scala.Array(i + 10, i + 20)
@@ -1144,7 +1158,11 @@ class GoldenTables extends QueryTest with SharedSparkSession {
11441158
case 6 => scala.Array()
11451159
}
11461160
} else null,
1147-
scala.Array(Row(i.longValue()), null),
1161+
// array_of_structs
1162+
if (i % 10 != 0) {
1163+
scala.Array(Row(i.longValue()), null)
1164+
} else null,
1165+
// map_of_prims
11481166
if (i % 28 != 0) {
11491167
if (i % 30 == 0) {
11501168
Map()
@@ -1155,7 +1173,11 @@ class GoldenTables extends QueryTest with SharedSparkSession {
11551173
)
11561174
}
11571175
} else null,
1158-
Map(i + 1 -> (if (i % 10 == 0) Row((i * 20).longValue()) else null)),
1176+
// map_of_rows
1177+
if (i % 25 != 0) {
1178+
Map(i + 1 -> (if (i % 10 == 0) Row((i * 20).longValue()) else null))
1179+
} else null,
1180+
// map_of_arrays
11591181
if (i % 30 != 0) {
11601182
if (i % 24 == 0) {
11611183
Map()
@@ -1181,7 +1203,6 @@ class GoldenTables extends QueryTest with SharedSparkSession {
11811203
.save(tablePath)
11821204
}
11831205

1184-
11851206
def writeBasicDecimalTable(tablePath: String): Unit = {
11861207
val data = Seq(
11871208
Seq("234", "1", "2", "3"),

0 commit comments

Comments
 (0)