diff --git a/src/duckdb/extension/parquet/include/parquet_reader.hpp b/src/duckdb/extension/parquet/include/parquet_reader.hpp index a4863b65c..97add9041 100644 --- a/src/duckdb/extension/parquet/include/parquet_reader.hpp +++ b/src/duckdb/extension/parquet/include/parquet_reader.hpp @@ -108,6 +108,7 @@ struct ParquetOptions { vector schema; idx_t explicit_cardinality = 0; + bool can_have_nan = false; // if floats or doubles can contain NaN values }; struct ParquetOptionsSerialization { diff --git a/src/duckdb/extension/parquet/include/parquet_statistics.hpp b/src/duckdb/extension/parquet/include/parquet_statistics.hpp index 2991a0415..fc53fa328 100644 --- a/src/duckdb/extension/parquet/include/parquet_statistics.hpp +++ b/src/duckdb/extension/parquet/include/parquet_statistics.hpp @@ -27,7 +27,7 @@ class ResizeableBuffer; struct ParquetStatisticsUtils { static unique_ptr TransformColumnStatistics(const ParquetColumnSchema &reader, - const vector &columns); + const vector &columns, bool can_have_nan); static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats); diff --git a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp index 56657c751..cf6c445d0 100644 --- a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +++ b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp @@ -318,6 +318,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() { table_function.named_parameters["schema"] = LogicalTypeId::ANY; table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY; table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR; + table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN; table_function.statistics = MultiFileFunction::MultiFileScanStats; table_function.serialize = ParquetScanSerialize; table_function.deserialize = ParquetScanDeserialize; @@ -365,6 +366,13 @@ bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]); return true; } + if (key == "can_have_nan") { + if (values.size() != 1) { + throw BinderException("Parquet can_have_nan cannot be empty!"); + } + options.can_have_nan = GetBooleanArgument(key, values); + return true; + } return false; } @@ -393,6 +401,10 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &ori options.debug_use_openssl = BooleanValue::Get(val); return true; } + if (key == "can_have_nan") { + options.can_have_nan = BooleanValue::Get(val); + return true; + } if (key == "schema") { // Argument is a map that defines the schema const auto &schema_value = val; diff --git a/src/duckdb/extension/parquet/parquet_reader.cpp b/src/duckdb/extension/parquet/parquet_reader.cpp index 638a0732d..2fa75cc7a 100644 --- a/src/duckdb/extension/parquet/parquet_reader.cpp +++ b/src/duckdb/extension/parquet/parquet_reader.cpp @@ -501,7 +501,7 @@ unique_ptr ParquetColumnSchema::Stats(ParquetReader &reader, idx stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES); return stats.ToUnique(); } - return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns); + return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, reader.parquet_options.can_have_nan); } ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat, @@ -1052,7 +1052,8 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i *stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter); } else if (!is_generated_column && has_min_max && (column_reader.Type().id() == LogicalTypeId::FLOAT || - column_reader.Type().id() == LogicalTypeId::DOUBLE)) { + column_reader.Type().id() == LogicalTypeId::DOUBLE) && + parquet_options.can_have_nan) { // floating point columns can have NaN values in addition to the min/max bounds defined in the file // in order to do optimal pruning - we prune based on the [min, max] of the file followed by pruning // based on nan diff --git a/src/duckdb/extension/parquet/parquet_statistics.cpp b/src/duckdb/extension/parquet/parquet_statistics.cpp index 228ec6cbe..f0253a163 100644 --- a/src/duckdb/extension/parquet/parquet_statistics.cpp +++ b/src/duckdb/extension/parquet/parquet_statistics.cpp @@ -304,7 +304,8 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, cons } unique_ptr ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema, - const vector &columns) { + const vector &columns, + bool can_have_nan) { // Not supported types auto &type = schema.type; @@ -320,7 +321,7 @@ unique_ptr ParquetStatisticsUtils::TransformColumnStatistics(con // Recurse into child readers for (idx_t i = 0; i < schema.children.size(); i++) { auto &child_schema = schema.children[i]; - auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns); + auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan); StructStats::SetChildStats(struct_stats, i, std::move(child_stats)); } row_group_stats = struct_stats.ToUnique(); @@ -363,7 +364,16 @@ unique_ptr ParquetStatisticsUtils::TransformColumnStatistics(con break; case LogicalTypeId::FLOAT: case LogicalTypeId::DOUBLE: - row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats); + if (can_have_nan) { + // Since parquet doesn't tell us if the column has NaN values, if the user has explicitly declared that it + // does, we create stats without an upper max value, as NaN compares larger than anything else. + row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats); + } else { + // Otherwise we use the numeric stats as usual, which might lead to "wrong" pruning if the column contains + // NaN values. The parquet spec is not clear on how to handle NaN values in statistics, and so this is + // probably the best we can do for now. + row_group_stats = CreateNumericStats(type, schema, parquet_stats); + } break; case LogicalTypeId::VARCHAR: { auto string_stats = StringStats::CreateEmpty(type); diff --git a/src/duckdb/extension/parquet/serialize_parquet.cpp b/src/duckdb/extension/parquet/serialize_parquet.cpp index a234bcee3..aa5632077 100644 --- a/src/duckdb/extension/parquet/serialize_parquet.cpp +++ b/src/duckdb/extension/parquet/serialize_parquet.cpp @@ -73,6 +73,7 @@ void ParquetOptionsSerialization::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault>(104, "encryption_config", parquet_options.encryption_config, nullptr); serializer.WritePropertyWithDefault(105, "debug_use_openssl", parquet_options.debug_use_openssl, true); serializer.WritePropertyWithDefault(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0); + serializer.WritePropertyWithDefault(107, "can_have_nan", parquet_options.can_have_nan, false); } ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) { @@ -84,6 +85,7 @@ ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserialize deserializer.ReadPropertyWithExplicitDefault>(104, "encryption_config", result.parquet_options.encryption_config, nullptr); deserializer.ReadPropertyWithExplicitDefault(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true); deserializer.ReadPropertyWithExplicitDefault(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0); + deserializer.ReadPropertyWithExplicitDefault(107, "can_have_nan", result.parquet_options.can_have_nan, false); return result; } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index c1d4a6677..0d3c52076 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "1-dev196" +#define DUCKDB_PATCH_VERSION "1-dev203" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 3 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.3.1-dev196" +#define DUCKDB_VERSION "v1.3.1-dev203" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "c5f80e216b" +#define DUCKDB_SOURCE_ID "a189a91bd3" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp"