diff --git a/c_glib/test/parquet/test-statistics.rb b/c_glib/test/parquet/test-statistics.rb index 4f21ebf00c868..0367084c88a49 100644 --- a/c_glib/test/parquet/test-statistics.rb +++ b/c_glib/test/parquet/test-statistics.rb @@ -51,7 +51,7 @@ def setup test("#has_n_distinct_values?") do assert do - @statistics.has_n_distinct_values? + not @statistics.has_n_distinct_values? end end diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index aa6df7e32a98c..a10fc2a4cfee7 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -463,6 +463,7 @@ class TypedStatisticsImpl : public TypedStatistics { public: using T = typename DType::c_type; + // Create an empty stats. TypedStatisticsImpl(const ColumnDescriptor* descr, MemoryPool* pool) : descr_(descr), pool_(pool), @@ -471,10 +472,9 @@ class TypedStatisticsImpl : public TypedStatistics { auto comp = Comparator::Make(descr); comparator_ = std::static_pointer_cast>(comp); TypedStatisticsImpl::Reset(); - has_null_count_ = true; - has_distinct_count_ = true; } + // Create stats from provided values. TypedStatisticsImpl(const T& min, const T& max, int64_t num_values, int64_t null_count, int64_t distinct_count) : pool_(default_memory_pool()), @@ -482,24 +482,29 @@ class TypedStatisticsImpl : public TypedStatistics { max_buffer_(AllocateBuffer(pool_, 0)) { TypedStatisticsImpl::IncrementNumValues(num_values); TypedStatisticsImpl::IncrementNullCount(null_count); - IncrementDistinctCount(distinct_count); + SetDistinctCount(distinct_count); Copy(min, &min_, min_buffer_.get()); Copy(max, &max_, max_buffer_.get()); has_min_max_ = true; } + // Create stats from a thrift Statistics object. TypedStatisticsImpl(const ColumnDescriptor* descr, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, int64_t distinct_count, bool has_min_max, bool has_null_count, bool has_distinct_count, MemoryPool* pool) : TypedStatisticsImpl(descr, pool) { TypedStatisticsImpl::IncrementNumValues(num_values); - if (has_null_count_) { + if (has_null_count) { TypedStatisticsImpl::IncrementNullCount(null_count); + } else { + has_null_count_ = false; } if (has_distinct_count) { - IncrementDistinctCount(distinct_count); + SetDistinctCount(distinct_count); + } else { + has_distinct_count_ = false; } if (!encoded_min.empty()) { @@ -541,9 +546,7 @@ class TypedStatisticsImpl : public TypedStatistics { void Reset() override { ResetCounts(); - has_min_max_ = false; - has_distinct_count_ = false; - has_null_count_ = false; + ResetHasFlags(); } void SetMinMax(const T& arg_min, const T& arg_max) override { @@ -552,12 +555,18 @@ class TypedStatisticsImpl : public TypedStatistics { void Merge(const TypedStatistics& other) override { this->num_values_ += other.num_values(); + // null_count is always valid when merging page statistics into + // column chunk statistics. if (other.HasNullCount()) { this->statistics_.null_count += other.null_count(); + } else { + this->has_null_count_ = false; } - if (other.HasDistinctCount()) { - this->statistics_.distinct_count += other.distinct_count(); - } + // Clear has_distinct_count_ as distinct count cannot be merged. + has_distinct_count_ = false; + // Do not clear min/max here if the other side does not provide + // min/max which may happen when other is an empty stats or all + // its values are null and/or NaN. if (other.HasMinMax()) { SetMinMax(other.min(), other.max()); } @@ -609,9 +618,10 @@ class TypedStatisticsImpl : public TypedStatistics { } if (HasNullCount()) { s.set_null_count(this->null_count()); + // num_values_ is reliable and it means number of non-null values. + s.all_null_value = num_values_ == 0; } - // num_values_ is reliable and it means number of non-null values. - s.all_null_value = num_values_ == 0; + // TODO (GH-36505): distinct count is not encoded for now. return s; } @@ -627,7 +637,12 @@ class TypedStatisticsImpl : public TypedStatistics { T min_; T max_; ::arrow::MemoryPool* pool_; - int64_t num_values_ = 0; // # of non-null values. + // Number of non-null values. + // Please note that num_values_ is reliable when has_null_count_ is set. + // When has_null_count_ is not set, e.g. a page statistics created from + // a statistics thrift message which doesn't have the optional null_count, + // `num_values_` may include null values. + int64_t num_values_ = 0; EncodedStatistics statistics_; std::shared_ptr> comparator_; std::shared_ptr min_buffer_, max_buffer_; @@ -637,8 +652,9 @@ class TypedStatisticsImpl : public TypedStatistics { void Copy(const T& src, T* dst, ResizableBuffer*) { *dst = src; } - void IncrementDistinctCount(int64_t n) { - statistics_.distinct_count += n; + void SetDistinctCount(int64_t n) { + // distinct count can only be "set", and cannot be incremented. + statistics_.distinct_count = n; has_distinct_count_ = true; } @@ -648,6 +664,17 @@ class TypedStatisticsImpl : public TypedStatistics { this->num_values_ = 0; } + void ResetHasFlags() { + // has_min_max_ will only be set when it meets any valid value. + this->has_min_max_ = false; + // has_distinct_count_ will only be set once SetDistinctCount() + // is called because distinct count calculation is not cheap and + // disabled by default. + this->has_distinct_count_ = false; + // Null count calculation is cheap and enabled by default. + this->has_null_count_ = true; + } + void SetMinMaxPair(std::pair min_max) { // CleanStatistic can return a nullopt in case of erroneous values, e.g. NaN auto maybe_min_max = CleanStatistic(min_max); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index 3f168a938ffbe..ae6c1ca29b2f6 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -117,17 +117,16 @@ std::shared_ptr> MakeComparator(const ColumnDescriptor* d // ---------------------------------------------------------------------- /// \brief Structure represented encoded statistics to be written to -/// and from Parquet serialized metadata +/// and read from Parquet serialized metadata. class PARQUET_EXPORT EncodedStatistics { - std::shared_ptr max_, min_; + std::string max_, min_; bool is_signed_ = false; public: - EncodedStatistics() - : max_(std::make_shared()), min_(std::make_shared()) {} + EncodedStatistics() = default; - const std::string& max() const { return *max_; } - const std::string& min() const { return *min_; } + const std::string& max() const { return max_; } + const std::string& min() const { return min_; } int64_t null_count = 0; int64_t distinct_count = 0; @@ -149,11 +148,13 @@ class PARQUET_EXPORT EncodedStatistics { // the true minimum for aggregations and there is no way to mark that a // value has been truncated and is a lower bound and not in the page. void ApplyStatSizeLimits(size_t length) { - if (max_->length() > length) { + if (max_.length() > length) { has_max = false; + max_.clear(); } - if (min_->length() > length) { + if (min_.length() > length) { has_min = false; + min_.clear(); } } @@ -165,14 +166,14 @@ class PARQUET_EXPORT EncodedStatistics { void set_is_signed(bool is_signed) { is_signed_ = is_signed; } - EncodedStatistics& set_max(const std::string& value) { - *max_ = value; + EncodedStatistics& set_max(std::string value) { + max_ = std::move(value); has_max = true; return *this; } - EncodedStatistics& set_min(const std::string& value) { - *min_ = value; + EncodedStatistics& set_min(std::string value) { + min_ = std::move(value); has_min = true; return *this; } @@ -329,7 +330,7 @@ class TypedStatistics : public Statistics { /// null count is determined from the indices) virtual void IncrementNullCount(int64_t n) = 0; - /// \brief Increments the number ov values directly + /// \brief Increments the number of values directly /// The same note on IncrementNullCount applies here virtual void IncrementNumValues(int64_t n) = 0; }; diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 8b9a42aa18bba..76667f0029b99 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -346,6 +346,9 @@ class TestStatistics : public PrimitiveTypedTest { ASSERT_EQ(this->values_.size(), statistics->num_values()); statistics->Reset(); + ASSERT_TRUE(statistics->HasNullCount()); + ASSERT_FALSE(statistics->HasMinMax()); + ASSERT_FALSE(statistics->HasDistinctCount()); ASSERT_EQ(0, statistics->null_count()); ASSERT_EQ(0, statistics->num_values()); ASSERT_EQ(0, statistics->distinct_count()); @@ -590,6 +593,178 @@ TYPED_TEST(TestNumericStatistics, Equals) { ASSERT_NO_FATAL_FAILURE(this->TestEquals()); } +template +class TestStatisticsHasFlag : public TestStatistics { + public: + void SetUp() override { + TestStatistics::SetUp(); + this->SetUpSchema(Repetition::OPTIONAL); + } + + std::shared_ptr> MergedStatistics( + const TypedStatistics& stats1, const TypedStatistics& stats2) { + auto chunk_statistics = MakeStatistics(this->schema_.Column(0)); + chunk_statistics->Merge(stats1); + chunk_statistics->Merge(stats2); + return chunk_statistics; + } + + void VerifyMergedStatistics( + const TypedStatistics& stats1, const TypedStatistics& stats2, + const std::function*)>& test_fn) { + ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats1, stats2).get())); + ASSERT_NO_FATAL_FAILURE(test_fn(MergedStatistics(stats2, stats1).get())); + } + + // Distinct count should set to false when Merge is called. + void TestMergeDistinctCount() { + // Create a statistics object with distinct count. + std::shared_ptr> statistics1; + { + EncodedStatistics encoded_statistics1; + statistics1 = std::dynamic_pointer_cast>( + Statistics::Make(this->schema_.Column(0), &encoded_statistics1, + /*num_values=*/1000)); + EXPECT_FALSE(statistics1->HasDistinctCount()); + } + + // Create a statistics object with distinct count. + std::shared_ptr> statistics2; + { + EncodedStatistics encoded_statistics2; + encoded_statistics2.has_distinct_count = true; + encoded_statistics2.distinct_count = 500; + statistics2 = std::dynamic_pointer_cast>( + Statistics::Make(this->schema_.Column(0), &encoded_statistics2, + /*num_values=*/1000)); + EXPECT_TRUE(statistics2->HasDistinctCount()); + } + + VerifyMergedStatistics(*statistics1, *statistics2, + [](TypedStatistics* merged_statistics) { + EXPECT_FALSE(merged_statistics->HasDistinctCount()); + EXPECT_FALSE(merged_statistics->Encode().has_distinct_count); + }); + } + + // If all values in a page are null or nan, its stats should not set min-max. + // Merging its stats with another page having good min-max stats should not + // drop the valid min-max from the latter page. + void TestMergeMinMax() { + this->GenerateData(1000); + // Create a statistics object without min-max. + std::shared_ptr> statistics1; + { + statistics1 = MakeStatistics(this->schema_.Column(0)); + statistics1->Update(this->values_ptr_, /*num_values=*/0, + /*null_count=*/this->values_.size()); + auto encoded_stats1 = statistics1->Encode(); + EXPECT_FALSE(statistics1->HasMinMax()); + EXPECT_FALSE(encoded_stats1.has_min); + EXPECT_FALSE(encoded_stats1.has_max); + } + // Create a statistics object with min-max. + std::shared_ptr> statistics2; + { + statistics2 = MakeStatistics(this->schema_.Column(0)); + statistics2->Update(this->values_ptr_, this->values_.size(), 0); + auto encoded_stats2 = statistics2->Encode(); + EXPECT_TRUE(statistics2->HasMinMax()); + EXPECT_TRUE(encoded_stats2.has_min); + EXPECT_TRUE(encoded_stats2.has_max); + } + VerifyMergedStatistics(*statistics1, *statistics2, + [](TypedStatistics* merged_statistics) { + EXPECT_TRUE(merged_statistics->HasMinMax()); + EXPECT_TRUE(merged_statistics->Encode().has_min); + EXPECT_TRUE(merged_statistics->Encode().has_max); + }); + } + + // Default statistics should have null_count even if no nulls is written. + // However, if statistics is created from thrift message, it might not + // have null_count. Merging statistics from such page will result in an + // invalid null_count as well. + void TestMergeNullCount() { + this->GenerateData(/*num_values=*/1000); + + // Page should have null-count even if no nulls + std::shared_ptr> statistics1; + { + statistics1 = MakeStatistics(this->schema_.Column(0)); + statistics1->Update(this->values_ptr_, /*num_values=*/this->values_.size(), + /*null_count=*/0); + auto encoded_stats1 = statistics1->Encode(); + EXPECT_TRUE(statistics1->HasNullCount()); + EXPECT_EQ(0, statistics1->null_count()); + EXPECT_TRUE(statistics1->Encode().has_null_count); + } + // Merge with null-count should also have null count + VerifyMergedStatistics(*statistics1, *statistics1, + [](TypedStatistics* merged_statistics) { + EXPECT_TRUE(merged_statistics->HasNullCount()); + EXPECT_EQ(0, merged_statistics->null_count()); + auto encoded = merged_statistics->Encode(); + EXPECT_TRUE(encoded.has_null_count); + EXPECT_EQ(0, encoded.null_count); + }); + + // When loaded from thrift, might not have null count. + std::shared_ptr> statistics2; + { + EncodedStatistics encoded_statistics2; + encoded_statistics2.has_null_count = false; + statistics2 = std::dynamic_pointer_cast>( + Statistics::Make(this->schema_.Column(0), &encoded_statistics2, + /*num_values=*/1000)); + EXPECT_FALSE(statistics2->Encode().has_null_count); + EXPECT_FALSE(statistics2->HasNullCount()); + } + + // Merge without null-count should not have null count + VerifyMergedStatistics(*statistics1, *statistics2, + [](TypedStatistics* merged_statistics) { + EXPECT_FALSE(merged_statistics->HasNullCount()); + EXPECT_FALSE(merged_statistics->Encode().has_null_count); + }); + } + + // statistics.all_null_value is used to build the page index. + // If statistics doesn't have null count, all_null_value should be false. + void TestMissingNullCount() { + EncodedStatistics encoded_statistics; + encoded_statistics.has_null_count = false; + auto statistics = Statistics::Make(this->schema_.Column(0), &encoded_statistics, + /*num_values=*/1000); + auto typed_stats = std::dynamic_pointer_cast>(statistics); + EXPECT_FALSE(typed_stats->HasNullCount()); + auto encoded = typed_stats->Encode(); + EXPECT_FALSE(encoded.all_null_value); + EXPECT_FALSE(encoded.has_null_count); + EXPECT_FALSE(encoded.has_distinct_count); + EXPECT_FALSE(encoded.has_min); + EXPECT_FALSE(encoded.has_max); + } +}; + +TYPED_TEST_SUITE(TestStatisticsHasFlag, Types); + +TYPED_TEST(TestStatisticsHasFlag, MergeDistinctCount) { + ASSERT_NO_FATAL_FAILURE(this->TestMergeDistinctCount()); +} + +TYPED_TEST(TestStatisticsHasFlag, MergeNullCount) { + ASSERT_NO_FATAL_FAILURE(this->TestMergeNullCount()); +} + +TYPED_TEST(TestStatisticsHasFlag, MergeMinMax) { + ASSERT_NO_FATAL_FAILURE(this->TestMergeMinMax()); +} + +TYPED_TEST(TestStatisticsHasFlag, MissingNullCount) { + ASSERT_NO_FATAL_FAILURE(this->TestMissingNullCount()); +} + // Helper for basic statistics tests below void AssertStatsSet(const ApplicationVersion& version, std::shared_ptr props, @@ -1212,5 +1387,21 @@ TEST(TestStatisticsSortOrderMinMax, Unsigned) { ASSERT_EQ(0x0b, stats->EncodeMax()[0]); } +TEST(TestEncodedStatistics, CopySafe) { + EncodedStatistics encoded_statistics; + encoded_statistics.set_max("abc"); + encoded_statistics.has_max = true; + + encoded_statistics.set_min("abc"); + encoded_statistics.has_min = true; + + EncodedStatistics copy_statistics = encoded_statistics; + copy_statistics.set_max("abcd"); + copy_statistics.set_min("a"); + + EXPECT_EQ("abc", encoded_statistics.min()); + EXPECT_EQ("abc", encoded_statistics.max()); +} + } // namespace test } // namespace parquet