diff --git a/extension/parquet/include/parquet_reader.hpp b/extension/parquet/include/parquet_reader.hpp
index 77d9500d52e..a7ffd59b863 100644
--- a/extension/parquet/include/parquet_reader.hpp
+++ b/extension/parquet/include/parquet_reader.hpp
@@ -111,6 +111,11 @@ class ParquetReader {
 	MultiFileReaderData reader_data;
 	unique_ptr<ColumnReader> root_reader;
 
+	//! Index of the file_row_number column
+	idx_t file_row_number_idx = DConstants::INVALID_INDEX;
+	//! Parquet schema for the generated columns
+	vector<SchemaElement> generated_column_schema;
+
 public:
 	void InitializeScan(ParquetReaderScanState &state, vector<idx_t> groups_to_read);
 	void Scan(ParquetReaderScanState &state, DataChunk &output);
diff --git a/extension/parquet/parquet_extension.cpp b/extension/parquet/parquet_extension.cpp
index 439472f4078..076453d16a8 100644
--- a/extension/parquet/parquet_extension.cpp
+++ b/extension/parquet/parquet_extension.cpp
@@ -185,6 +185,17 @@ static MultiFileReaderBindData BindSchema(ClientContext &context, vector<Logical
 
 	// perform the binding on the obtained set of names + types
 	MultiFileReaderBindData bind_data;
+	if (options.file_row_number) {
+		if (std::find(schema_col_names.begin(), schema_col_names.end(), "file_row_number") !=
+		    schema_col_names.end()) {
+			throw BinderException(
+			    "Using file_row_number option on file with column named file_row_number is not supported");
+		}
+
+		bind_data.file_row_number_idx = schema_col_names.size();
+		schema_col_names.push_back("file_row_number");
+		schema_col_types.emplace_back(LogicalType::BIGINT);
+	}
 	MultiFileReader::BindOptions(options.file_options, result.files, schema_col_types, schema_col_names, bind_data);
 
 	names = schema_col_names;
@@ -234,10 +245,19 @@ static void InitializeParquetReader(ParquetReader &reader, const ParquetReadBind
 			continue;
 		}
 
+		// handle any generated columns that are not in the schema (currently only file_row_number)
+		if (global_column_index >= parquet_options.schema.size()) {
+			if (bind_data.reader_bind.file_row_number_idx == global_column_index) {
+				reader_data.column_mapping.push_back(i);
+				reader_data.column_ids.push_back(reader.file_row_number_idx);
+			}
+			continue;
+		}
+
 		const auto &column_definition = parquet_options.schema[global_column_index];
 		auto it = field_id_to_column_index.find(column_definition.field_id);
 		if (it == field_id_to_column_index.end()) {
 			// field id not present in file, use default value
-			reader_data.constant_map.emplace_back(global_column_index, column_definition.default_value);
+			reader_data.constant_map.emplace_back(i, column_definition.default_value);
 			continue;
 		}
@@ -249,7 +269,7 @@ static void InitializeParquetReader(ParquetReader &reader, const ParquetReadBind
 			reader_data.cast_map[local_column_index] = column_definition.type;
 		}
 
-		reader_data.column_mapping.push_back(global_column_index);
+		reader_data.column_mapping.push_back(i);
 		reader_data.column_ids.push_back(local_column_index);
 	}
 	reader_data.empty_columns = reader_data.column_ids.empty();
@@ -384,6 +404,7 @@ class ParquetScanFunction {
 			// a schema was supplied
 			result->reader_bind = BindSchema(context, result->types, result->names, *result, parquet_options);
 		}
+
 		if (return_types.empty()) {
 			// no expected types - just copy the types
 			return_types = result->types;
diff --git a/extension/parquet/parquet_reader.cpp b/extension/parquet/parquet_reader.cpp
index 05eec27e69e..75e35b3d712 100644
--- a/extension/parquet/parquet_reader.cpp
+++ b/extension/parquet/parquet_reader.cpp
@@ -381,8 +381,11 @@ unique_ptr<ColumnReader> ParquetReader::CreateReader() {
 		root_struct_reader.child_readers[column_idx] = std::move(cast_reader);
 	}
 	if (parquet_options.file_row_number) {
-		root_struct_reader.child_readers.push_back(
-		    make_uniq<RowNumberColumnReader>(*this, LogicalType::BIGINT, SchemaElement(), next_file_idx, 0, 0));
+		file_row_number_idx = root_struct_reader.child_readers.size();
+
+		generated_column_schema.push_back(SchemaElement());
+		root_struct_reader.child_readers.push_back(make_uniq<RowNumberColumnReader>(
+		    *this, LogicalType::BIGINT, generated_column_schema.back(), next_file_idx, 0, 0));
 	}
 
 	return ret;
diff --git a/src/include/duckdb/common/multi_file_reader.hpp b/src/include/duckdb/common/multi_file_reader.hpp
index d6a39a53f80..ca52810e8df 100644
--- a/src/include/duckdb/common/multi_file_reader.hpp
+++ b/src/include/duckdb/common/multi_file_reader.hpp
@@ -40,6 +40,8 @@ struct MultiFileReaderBindData {
 	idx_t filename_idx = DConstants::INVALID_INDEX;
 	//! The set of hive partitioning indexes (if any)
 	vector<HivePartitioningIndex> hive_partitioning_indexes;
+	//! The index of the file_row_number column (if any)
+	idx_t file_row_number_idx = DConstants::INVALID_INDEX;
 
 	DUCKDB_API void Serialize(Serializer &serializer) const;
 	DUCKDB_API static MultiFileReaderBindData Deserialize(Deserializer &deserializer);
diff --git a/test/parquet/test_parquet_schema.test b/test/parquet/test_parquet_schema.test
index 4be770610a3..a7712718a3d 100644
--- a/test/parquet/test_parquet_schema.test
+++ b/test/parquet/test_parquet_schema.test
@@ -183,6 +183,18 @@ FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
 ----
 5
 
+# projection still, even with different generated columns
+query III
+SELECT file_row_number, filename[-16:], i4
+FROM read_parquet('__TEST_DIR__/integers.parquet', schema=map {
+        1: {name: 'i1', type: 'BIGINT', default_value: NULL},
+        3: {name: 'i3', type: 'BIGINT', default_value: NULL},
+        4: {name: 'i4', type: 'BIGINT', default_value: 2},
+        5: {name: 'i5', type: 'BIGINT', default_value: NULL}
+    }, file_row_number=1, filename=1)
+----
+0	integers.parquet	2
+
 # count(*) still ok
 query I
 SELECT count(*)