Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/duckdb/extension/parquet/include/parquet_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ struct ParquetOptions {

vector<ParquetColumnDefinition> schema;
idx_t explicit_cardinality = 0;
bool can_have_nan = false; // if floats or doubles can contain NaN values
};

struct ParquetOptionsSerialization {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ResizeableBuffer;
struct ParquetStatisticsUtils {

static unique_ptr<BaseStatistics> TransformColumnStatistics(const ParquetColumnSchema &reader,
const vector<ColumnChunk> &columns);
const vector<ColumnChunk> &columns, bool can_have_nan);

static Value ConvertValue(const LogicalType &type, const ParquetColumnSchema &schema_ele, const std::string &stats);

Expand Down
12 changes: 12 additions & 0 deletions src/duckdb/extension/parquet/parquet_multi_file_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ TableFunctionSet ParquetScanFunction::GetFunctionSet() {
table_function.named_parameters["schema"] = LogicalTypeId::ANY;
table_function.named_parameters["encryption_config"] = LogicalTypeId::ANY;
table_function.named_parameters["parquet_version"] = LogicalType::VARCHAR;
table_function.named_parameters["can_have_nan"] = LogicalType::BOOLEAN;
table_function.statistics = MultiFileFunction<ParquetMultiFileInfo>::MultiFileScanStats;
table_function.serialize = ParquetScanSerialize;
table_function.deserialize = ParquetScanDeserialize;
Expand Down Expand Up @@ -365,6 +366,13 @@ bool ParquetMultiFileInfo::ParseCopyOption(ClientContext &context, const string
options.encryption_config = ParquetEncryptionConfig::Create(context, values[0]);
return true;
}
if (key == "can_have_nan") {
if (values.size() != 1) {
throw BinderException("Parquet can_have_nan cannot be empty!");
}
options.can_have_nan = GetBooleanArgument(key, values);
return true;
}
return false;
}

Expand Down Expand Up @@ -393,6 +401,10 @@ bool ParquetMultiFileInfo::ParseOption(ClientContext &context, const string &ori
options.debug_use_openssl = BooleanValue::Get(val);
return true;
}
if (key == "can_have_nan") {
options.can_have_nan = BooleanValue::Get(val);
return true;
}
if (key == "schema") {
// Argument is a map that defines the schema
const auto &schema_value = val;
Expand Down
5 changes: 3 additions & 2 deletions src/duckdb/extension/parquet/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ unique_ptr<BaseStatistics> ParquetColumnSchema::Stats(ParquetReader &reader, idx
stats.Set(StatsInfo::CANNOT_HAVE_NULL_VALUES);
return stats.ToUnique();
}
return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns);
return ParquetStatisticsUtils::TransformColumnStatistics(*this, columns, reader.parquet_options.can_have_nan);
}

ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_define, idx_t max_repeat,
Expand Down Expand Up @@ -1052,7 +1052,8 @@ void ParquetReader::PrepareRowGroupBuffer(ParquetReaderScanState &state, idx_t i
*stats, group.columns[column_reader.ColumnIndex()].meta_data.statistics, filter);
} else if (!is_generated_column && has_min_max &&
(column_reader.Type().id() == LogicalTypeId::FLOAT ||
column_reader.Type().id() == LogicalTypeId::DOUBLE)) {
column_reader.Type().id() == LogicalTypeId::DOUBLE) &&
parquet_options.can_have_nan) {
// Floating point columns can contain NaN values in addition to the values within the min/max bounds
// defined in the file. For optimal pruning, we first prune based on the file's [min, max] and then
// prune based on NaN.
Expand Down
16 changes: 13 additions & 3 deletions src/duckdb/extension/parquet/parquet_statistics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,8 @@ Value ParquetStatisticsUtils::ConvertValueInternal(const LogicalType &type, cons
}

unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(const ParquetColumnSchema &schema,
const vector<ColumnChunk> &columns) {
const vector<ColumnChunk> &columns,
bool can_have_nan) {

// Not supported types
auto &type = schema.type;
Expand All @@ -320,7 +321,7 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
// Recurse into child readers
for (idx_t i = 0; i < schema.children.size(); i++) {
auto &child_schema = schema.children[i];
auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns);
auto child_stats = ParquetStatisticsUtils::TransformColumnStatistics(child_schema, columns, can_have_nan);
StructStats::SetChildStats(struct_stats, i, std::move(child_stats));
}
row_group_stats = struct_stats.ToUnique();
Expand Down Expand Up @@ -363,7 +364,16 @@ unique_ptr<BaseStatistics> ParquetStatisticsUtils::TransformColumnStatistics(con
break;
case LogicalTypeId::FLOAT:
case LogicalTypeId::DOUBLE:
row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
if (can_have_nan) {
// Parquet does not tell us whether a column contains NaN values. If the user has explicitly declared that
// it can, we create stats without an upper bound, because NaN compares larger than any other value.
row_group_stats = CreateFloatingPointStats(type, schema, parquet_stats);
} else {
// Otherwise we use the regular numeric stats, which might lead to incorrect pruning if the column does
// contain NaN values. The Parquet spec is not clear on how NaN values should be handled in statistics,
// so this is probably the best we can do for now.
row_group_stats = CreateNumericStats(type, schema, parquet_stats);
}
break;
case LogicalTypeId::VARCHAR: {
auto string_stats = StringStats::CreateEmpty(type);
Expand Down
2 changes: 2 additions & 0 deletions src/duckdb/extension/parquet/serialize_parquet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ void ParquetOptionsSerialization::Serialize(Serializer &serializer) const {
serializer.WritePropertyWithDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", parquet_options.encryption_config, nullptr);
serializer.WritePropertyWithDefault<bool>(105, "debug_use_openssl", parquet_options.debug_use_openssl, true);
serializer.WritePropertyWithDefault<idx_t>(106, "explicit_cardinality", parquet_options.explicit_cardinality, 0);
serializer.WritePropertyWithDefault<bool>(107, "can_have_nan", parquet_options.can_have_nan, false);
}

ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserializer &deserializer) {
Expand All @@ -84,6 +85,7 @@ ParquetOptionsSerialization ParquetOptionsSerialization::Deserialize(Deserialize
deserializer.ReadPropertyWithExplicitDefault<shared_ptr<ParquetEncryptionConfig>>(104, "encryption_config", result.parquet_options.encryption_config, nullptr);
deserializer.ReadPropertyWithExplicitDefault<bool>(105, "debug_use_openssl", result.parquet_options.debug_use_openssl, true);
deserializer.ReadPropertyWithExplicitDefault<idx_t>(106, "explicit_cardinality", result.parquet_options.explicit_cardinality, 0);
deserializer.ReadPropertyWithExplicitDefault<bool>(107, "can_have_nan", result.parquet_options.can_have_nan, false);
return result;
}

Expand Down
6 changes: 3 additions & 3 deletions src/duckdb/src/function/table/version/pragma_version.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef DUCKDB_PATCH_VERSION
#define DUCKDB_PATCH_VERSION "1-dev196"
#define DUCKDB_PATCH_VERSION "1-dev203"
#endif
#ifndef DUCKDB_MINOR_VERSION
#define DUCKDB_MINOR_VERSION 3
Expand All @@ -8,10 +8,10 @@
#define DUCKDB_MAJOR_VERSION 1
#endif
#ifndef DUCKDB_VERSION
#define DUCKDB_VERSION "v1.3.1-dev196"
#define DUCKDB_VERSION "v1.3.1-dev203"
#endif
#ifndef DUCKDB_SOURCE_ID
#define DUCKDB_SOURCE_ID "c5f80e216b"
#define DUCKDB_SOURCE_ID "a189a91bd3"
#endif
#include "duckdb/function/table/system_functions.hpp"
#include "duckdb/main/database.hpp"
Expand Down