@@ -135,10 +135,66 @@ class ParquetFileReaderSuite extends AnyFunSuite
135135 checkAnswer(actResult1, expResult1)
136136
137137 // File with multiple row-groups [0, 20000) where rowIndex = id
138- val filePath = getTestResourceFilePath("parquet/")
138+ val filePath = getTestResourceFilePath("parquet/row_index_multiple_row_groups.parquet")
139139 val actResult2 = readParquetFilesUsingKernel(filePath, readSchema)
140140 val expResult2 = (0L until 20000L).map(i => TestRow(i, i))
141141
142142 checkAnswer(actResult2, expResult2)
143143 }
144+
145+ // ///////////////////////////////////////////////////////////////////////////////////////////////
146+ // Test compatibility with Parquet legacy format files //
147+ // ///////////////////////////////////////////////////////////////////////////////////////////////
148+
149+ // Test and the test file are copied from Spark's `ParquetThriftCompatibilitySuite`
150+ test(" read parquet file generated by parquet-thrift" ) {
151+ val parquetFilePath = getTestResourceFilePath(" parquet/parquet-thrift-compat.snappy.parquet" )
152+
153+ val readSchema = new StructType ()
154+ .add(" boolColumn" , BooleanType .BOOLEAN )
155+ .add(" byteColumn" , ByteType .BYTE )
156+ .add(" shortColumn" , ShortType .SHORT )
157+ .add(" intColumn" , IntegerType .INTEGER )
158+ .add(" longColumn" , LongType .LONG )
159+ .add(" doubleColumn" , DoubleType .DOUBLE )
160+ // Thrift `BINARY` values are actually unencoded `STRING` values, and thus are always
161+ // treated as `BINARY (UTF8)` in parquet-thrift, since parquet-thrift always assume
162+ // Thrift `STRING`s are encoded using UTF-8.
163+ .add(" binaryColumn" , StringType .STRING )
164+ .add(" stringColumn" , StringType .STRING )
165+ .add(" enumColumn" , StringType .STRING )
166+ // maybe indicates nullable columns, above ones are non-nullable
167+ .add(" maybeBoolColumn" , BooleanType .BOOLEAN )
168+ .add(" maybeByteColumn" , ByteType .BYTE )
169+ .add(" maybeShortColumn" , ShortType .SHORT )
170+ .add(" maybeIntColumn" , IntegerType .INTEGER )
171+ .add(" maybeLongColumn" , LongType .LONG )
172+ .add(" maybeDoubleColumn" , DoubleType .DOUBLE )
173+ // Thrift `BINARY` values are actually unencoded `STRING` values, and thus are always
174+ // treated as `BINARY (UTF8)` in parquet-thrift, since parquet-thrift always assume
175+ // Thrift `STRING`s are encoded using UTF-8.
176+ .add(" maybeBinaryColumn" , StringType .STRING )
177+ .add(" maybeStringColumn" , StringType .STRING )
178+ .add(" maybeEnumColumn" , StringType .STRING )
179+ // TODO: not working - separate PR to handle 2-level legacy lists
180+ // .add("stringsColumn", new ArrayType(StringType.STRING, true /* containsNull */))
181+ // .add("intSetColumn", new ArrayType(IntegerType.INTEGER, true /* containsNull */))
182+ .add(" intToStringColumn" ,
183+ new MapType (IntegerType .INTEGER , StringType .STRING , true /* valueContainsNull */ ))
184+ // TODO: not working - separate PR to handle 2-level legacy lists
185+ // .add("complexColumn", new MapType(
186+ // IntegerType.INTEGER,
187+ // new ArrayType(
188+ // new StructType()
189+ // .add("nestedIntsColumn", new ArrayType(IntegerType.INTEGER, true /* containsNull */))
190+ // .add("nestedStringColumn", StringType.STRING)
191+ // .add("stringColumn", StringType.STRING),
192+ // true /* containsNull */),
193+ // true /* valueContainsNull */))
194+
195+ assert(parquetFileRowCount(parquetFilePath) === 10 )
196+ checkAnswer(
197+ readParquetFilesUsingKernel(parquetFilePath, readSchema), /* actual */
198+ readParquetFilesUsingSpark(parquetFilePath, readSchema) /* expected */ )
199+ }
144200}
0 commit comments