Fix major bug with prefixes, SeekForPrev, and partitioned filters (#1…

…2872) Summary: Basically, the fix in #8137 was incomplete (and I missed it in the review), because if `whole_key_filtering` is false, then `last_prefix_str_` will never be set to non-empty and the fix doesn't work. Also related to #5835. This is intended as a safe, simple fix that will regress CPU efficiency slightly (for `whole_key_filtering=false` cases, because of extra prefix string copies during flush & compaction). An efficient fix is not possible without some substantial refactoring. Also in this PR: new test DBBloomFilterTest.FilterNumEntriesCoalesce tests an adjacent code path that was previously untested for its effect of ensuring the number of unique prefixes and keys is tracked properly when both prefixes and whole keys are going into a filter. (Test fails when either of the two code segments checking for duplicates is disabled.) In addition, the same test would fail before the main bug fix here because the code would inappropriately add the empty string to the filter (because of unmodified `last_prefix_str_`). Pull Request resolved: #12872 Test Plan: In addition to DBBloomFilterTest.FilterNumEntriesCoalesce, extended DBBloomFilterTest.SeekForPrevWithPartitionedFilters to cover the broken case. (Mostly whitespace change.) Reviewed By: jowlyzhang Differential Revision: D59873793 Pulled By: pdillinger fbshipit-source-id: 2a7b7f09ca73dc188fb4dab833826ad6da7ebb11
facebook · Jul 17, 2024 · 93b163d · 93b163d
1 parent 21db55f
commit 93b163d
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 45 deletions.
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
@@ -322,6 +322,67 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
   }
 }
 
+TEST_F(DBBloomFilterTest, FilterNumEntriesCoalesce) {
+  for (bool partition_filters : {true, false}) {
+    SCOPED_TRACE("partition_filters=" + std::to_string(partition_filters));
+    for (bool prefix : {true, false}) {
+      SCOPED_TRACE("prefix=" + std::to_string(prefix));
+      for (bool whole : {true, false}) {
+        SCOPED_TRACE("whole=" + std::to_string(whole));
+        Options options = last_options_;
+        options.prefix_extractor.reset();
+        if (prefix) {
+          options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+        }
+        BlockBasedTableOptions bbto;
+        bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+        bbto.whole_key_filtering = whole;
+        if (partition_filters) {
+          bbto.partition_filters = true;
+          bbto.index_type =
+              BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+        }
+        options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+        DestroyAndReopen(options);
+
+        // Need a snapshot to allow keeping multiple entries for the same key
+        std::vector<const Snapshot*> snapshots;
+        for (int i = 1; i <= 3; ++i) {
+          std::string val = "val" + std::to_string(i);
+          ASSERT_OK(Put("foo1", val));
+          ASSERT_OK(Put("foo2", val));
+          ASSERT_OK(Put("bar1", val));
+          ASSERT_OK(Put("bar2", val));
+          ASSERT_OK(Put("bar3", val));
+          snapshots.push_back(db_->GetSnapshot());
+        }
+        ASSERT_OK(Flush());
+
+        TablePropertiesCollection tpc;
+        ASSERT_OK(db_->GetPropertiesOfAllTables(&tpc));
+        // sanity checks
+        ASSERT_EQ(tpc.size(), 1U);
+        auto& tp = *tpc.begin()->second;
+        EXPECT_EQ(tp.num_entries, 3U * 5U);
+
+        // test checks
+        unsigned ex_filter_entries = 0;
+        if (whole) {
+          ex_filter_entries += 5;  // unique keys
+        }
+        if (prefix) {
+          ex_filter_entries += 2;  // unique prefixes
+        }
+        EXPECT_EQ(tp.num_filter_entries, ex_filter_entries);
+
+        for (auto* sn : snapshots) {
+          db_->ReleaseSnapshot(sn);
+        }
+      }
+    }
+  }
+}
+
 TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
   for (bool partition_filters : {true, false}) {
     Options options = last_options_;
@@ -3143,52 +3204,55 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
 }
 
 TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
-  Options options = CurrentOptions();
-  constexpr size_t kNumKeys = 10000;
-  static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
-  options.memtable_factory.reset(
-      test::NewSpecialSkipListFactory(kNumKeys + 10));
-  options.create_if_missing = true;
-  constexpr size_t kPrefixLength = 4;
-  options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
-  options.compression = kNoCompression;
-  BlockBasedTableOptions bbto;
-  bbto.filter_policy.reset(NewBloomFilterPolicy(50));
-  bbto.index_shortening =
-      BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
-  bbto.block_size = 128;
-  bbto.metadata_block_size = 128;
-  bbto.partition_filters = true;
-  bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
-  options.table_factory.reset(NewBlockBasedTableFactory(bbto));
-  DestroyAndReopen(options);
+  for (bool wkf : {true, false}) {
+    SCOPED_TRACE("whole_key_filtering=" + std::to_string(wkf));
+    Options options = CurrentOptions();
+    constexpr size_t kNumKeys = 10000;
+    static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
+    options.memtable_factory.reset(
+        test::NewSpecialSkipListFactory(kNumKeys + 10));
+    options.create_if_missing = true;
+    constexpr size_t kPrefixLength = 4;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
+    options.compression = kNoCompression;
+    BlockBasedTableOptions bbto;
+    bbto.filter_policy.reset(NewBloomFilterPolicy(50));
+    bbto.index_shortening =
+        BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+    bbto.block_size = 128;
+    bbto.metadata_block_size = 128;
+    bbto.partition_filters = true;
+    bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+    bbto.whole_key_filtering = wkf;
+    options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+    DestroyAndReopen(options);
 
-  const std::string value(64, '\0');
+    const std::string value(64, '\0');
 
-  WriteOptions write_opts;
-  write_opts.disableWAL = true;
-  for (size_t i = 0; i < kNumKeys; ++i) {
-    std::ostringstream oss;
-    oss << std::setfill('0') << std::setw(4) << std::fixed << i;
-    ASSERT_OK(db_->Put(write_opts, oss.str(), value));
-  }
-  ASSERT_OK(Flush());
+    WriteOptions write_opts;
+    write_opts.disableWAL = true;
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      std::ostringstream oss;
+      oss << std::setfill('0') << std::setw(4) << std::fixed << i;
+      ASSERT_OK(db_->Put(write_opts, oss.str(), value));
+    }
+    ASSERT_OK(Flush());
 
-  ReadOptions read_opts;
-  // Use legacy, implicit prefix seek
-  read_opts.total_order_seek = false;
-  read_opts.auto_prefix_mode = false;
-  std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
-  for (size_t i = 0; i < kNumKeys; ++i) {
-    // Seek with a key after each one added but with same prefix. One will
-    // surely cross a partition boundary.
-    std::ostringstream oss;
-    oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
-    it->SeekForPrev(oss.str());
-    ASSERT_OK(it->status());
-    ASSERT_TRUE(it->Valid());
-  }
-  it.reset();
+    ReadOptions read_opts;
+    // Use legacy, implicit prefix seek
+    read_opts.total_order_seek = false;
+    read_opts.auto_prefix_mode = false;
+    std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+    for (size_t i = 0; i < kNumKeys; ++i) {
+      // Seek with a key after each one added but with same prefix. One will
+      // surely cross a partition boundary.
+      std::ostringstream oss;
+      oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+      it->SeekForPrev(oss.str());
+      ASSERT_OK(it->status());
+      ASSERT_TRUE(it->Valid());
+    }
+  }
 }
 
 namespace {

diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc
@@ -82,7 +82,14 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
 void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
   assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
   Slice prefix = prefix_extractor_->Transform(key);
-  if (whole_key_filtering_) {
+  if (true /*WART*/ || whole_key_filtering_) {
+    // WART/FIXME: Because last_prefix_str_ is needed above to make
+    // SeekForPrev work, we are currently using this inefficient code always.
+    // Hopefully this can be optimized with some refactoring up the call chain
+    // to BlockBasedTableBuilder. Even in PartitionedFilterBlockBuilder, we
+    // don't currently have access to the previous key/prefix by the time we
+    // know we are starting a new partition.
+
     // if both whole_key and prefix are added to bloom then we will have whole
     // key and prefix addition being interleaved and thus cannot rely on the
     // bits builder to properly detect the duplicates by comparing with the last

diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc
@@ -86,7 +86,7 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
   }
 
   // Add the prefix of the next key before finishing the partition without
-  // updating last_prefix_str_. This hack, fixes a bug with format_verison=3
+  // updating last_prefix_str_. This hack fixes a bug with format_verison=3
   // where seeking for the prefix would lead us to the previous partition.
   const bool maybe_add_prefix =
       next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);

diff --git a/unreleased_history/bug_fixes/0seek_for_prev_prefix.md b/unreleased_history/bug_fixes/0seek_for_prev_prefix.md
@@ -0,0 +1 @@
+* Fix a major bug in which an iterator using prefix filtering and SeekForPrev might miss data when the DB is using `whole_key_filtering=false` and `partition_filters=true`.