Skip to content

Commit

Permalink
Fix major bug with prefixes, SeekForPrev, and partitioned filters (#1…
Browse files Browse the repository at this point in the history
…2872)

Summary:
Basically, the fix in #8137 was incomplete (and I missed it in the review), because if `whole_key_filtering` is false, then `last_prefix_str_` will never be set to non-empty and the fix doesn't work. Also related to #5835.

This is intended as a safe, simple fix that will regress CPU efficiency slightly (for `whole_key_filtering=false` cases, because of extra prefix string copies during flush & compaction). An efficient fix is not possible without some substantial refactoring.

Also in this PR: new test DBBloomFilterTest.FilterNumEntriesCoalesce tests an adjacent code path that was previously untested for its effect of ensuring the number of unique prefixes and keys is tracked properly when both prefixes and whole keys are going into a filter. (Test fails when either of the two code segments checking for duplicates is disabled.) In addition, the same test would fail before the main bug fix here because the code would inappropriately add the empty string to the filter (because of unmodified `last_prefix_str_`).

Pull Request resolved: #12872

Test Plan: In addition to DBBloomFilterTest.FilterNumEntriesCoalesce, extended DBBloomFilterTest.SeekForPrevWithPartitionedFilters to cover the broken case. (Mostly whitespace change.)

Reviewed By: jowlyzhang

Differential Revision: D59873793

Pulled By: pdillinger

fbshipit-source-id: 2a7b7f09ca73dc188fb4dab833826ad6da7ebb11
  • Loading branch information
pdillinger authored and facebook-github-bot committed Jul 17, 2024
1 parent 21db55f commit 93b163d
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 45 deletions.
150 changes: 107 additions & 43 deletions db/db_bloom_filter_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,67 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
}
}

TEST_F(DBBloomFilterTest, FilterNumEntriesCoalesce) {
for (bool partition_filters : {true, false}) {
SCOPED_TRACE("partition_filters=" + std::to_string(partition_filters));
for (bool prefix : {true, false}) {
SCOPED_TRACE("prefix=" + std::to_string(prefix));
for (bool whole : {true, false}) {
SCOPED_TRACE("whole=" + std::to_string(whole));
Options options = last_options_;
options.prefix_extractor.reset();
if (prefix) {
options.prefix_extractor.reset(NewFixedPrefixTransform(3));
}
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(NewBloomFilterPolicy(10));
bbto.whole_key_filtering = whole;
if (partition_filters) {
bbto.partition_filters = true;
bbto.index_type =
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
}
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);

// Need a snapshot to allow keeping multiple entries for the same key
std::vector<const Snapshot*> snapshots;
for (int i = 1; i <= 3; ++i) {
std::string val = "val" + std::to_string(i);
ASSERT_OK(Put("foo1", val));
ASSERT_OK(Put("foo2", val));
ASSERT_OK(Put("bar1", val));
ASSERT_OK(Put("bar2", val));
ASSERT_OK(Put("bar3", val));
snapshots.push_back(db_->GetSnapshot());
}
ASSERT_OK(Flush());

TablePropertiesCollection tpc;
ASSERT_OK(db_->GetPropertiesOfAllTables(&tpc));
// sanity checks
ASSERT_EQ(tpc.size(), 1U);
auto& tp = *tpc.begin()->second;
EXPECT_EQ(tp.num_entries, 3U * 5U);

// test checks
unsigned ex_filter_entries = 0;
if (whole) {
ex_filter_entries += 5; // unique keys
}
if (prefix) {
ex_filter_entries += 2; // unique prefixes
}
EXPECT_EQ(tp.num_filter_entries, ex_filter_entries);

for (auto* sn : snapshots) {
db_->ReleaseSnapshot(sn);
}
}
}
}
}

TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
for (bool partition_filters : {true, false}) {
Options options = last_options_;
Expand Down Expand Up @@ -3143,52 +3204,55 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
}

TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
Options options = CurrentOptions();
constexpr size_t kNumKeys = 10000;
static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeys + 10));
options.create_if_missing = true;
constexpr size_t kPrefixLength = 4;
options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
options.compression = kNoCompression;
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(NewBloomFilterPolicy(50));
bbto.index_shortening =
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
bbto.block_size = 128;
bbto.metadata_block_size = 128;
bbto.partition_filters = true;
bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);
for (bool wkf : {true, false}) {
SCOPED_TRACE("whole_key_filtering=" + std::to_string(wkf));
Options options = CurrentOptions();
constexpr size_t kNumKeys = 10000;
static_assert(kNumKeys <= 10000, "kNumKeys have to be <= 10000");
options.memtable_factory.reset(
test::NewSpecialSkipListFactory(kNumKeys + 10));
options.create_if_missing = true;
constexpr size_t kPrefixLength = 4;
options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
options.compression = kNoCompression;
BlockBasedTableOptions bbto;
bbto.filter_policy.reset(NewBloomFilterPolicy(50));
bbto.index_shortening =
BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
bbto.block_size = 128;
bbto.metadata_block_size = 128;
bbto.partition_filters = true;
bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
bbto.whole_key_filtering = wkf;
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
DestroyAndReopen(options);

const std::string value(64, '\0');
const std::string value(64, '\0');

WriteOptions write_opts;
write_opts.disableWAL = true;
for (size_t i = 0; i < kNumKeys; ++i) {
std::ostringstream oss;
oss << std::setfill('0') << std::setw(4) << std::fixed << i;
ASSERT_OK(db_->Put(write_opts, oss.str(), value));
}
ASSERT_OK(Flush());
WriteOptions write_opts;
write_opts.disableWAL = true;
for (size_t i = 0; i < kNumKeys; ++i) {
std::ostringstream oss;
oss << std::setfill('0') << std::setw(4) << std::fixed << i;
ASSERT_OK(db_->Put(write_opts, oss.str(), value));
}
ASSERT_OK(Flush());

ReadOptions read_opts;
// Use legacy, implicit prefix seek
read_opts.total_order_seek = false;
read_opts.auto_prefix_mode = false;
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
for (size_t i = 0; i < kNumKeys; ++i) {
// Seek with a key after each one added but with same prefix. One will
// surely cross a partition boundary.
std::ostringstream oss;
oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
it->SeekForPrev(oss.str());
ASSERT_OK(it->status());
ASSERT_TRUE(it->Valid());
}
it.reset();
ReadOptions read_opts;
// Use legacy, implicit prefix seek
read_opts.total_order_seek = false;
read_opts.auto_prefix_mode = false;
std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
for (size_t i = 0; i < kNumKeys; ++i) {
// Seek with a key after each one added but with same prefix. One will
// surely cross a partition boundary.
std::ostringstream oss;
oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
it->SeekForPrev(oss.str());
ASSERT_OK(it->status());
ASSERT_TRUE(it->Valid());
}
}
}

namespace {
Expand Down
9 changes: 8 additions & 1 deletion table/block_based/full_filter_block.cc
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,14 @@ inline void FullFilterBlockBuilder::AddKey(const Slice& key) {
void FullFilterBlockBuilder::AddPrefix(const Slice& key) {
assert(prefix_extractor_ && prefix_extractor_->InDomain(key));
Slice prefix = prefix_extractor_->Transform(key);
if (whole_key_filtering_) {
if (true /*WART*/ || whole_key_filtering_) {
// WART/FIXME: Because last_prefix_str_ is needed above to make
// SeekForPrev work, we are currently using this inefficient code always.
// Hopefully this can be optimized with some refactoring up the call chain
// to BlockBasedTableBuilder. Even in PartitionedFilterBlockBuilder, we
// don't currently have access to the previous key/prefix by the time we
// know we are starting a new partition.

// if both whole_key and prefix are added to bloom then we will have whole
// key and prefix addition being interleaved and thus cannot rely on the
// bits builder to properly detect the duplicates by comparing with the last
Expand Down
2 changes: 1 addition & 1 deletion table/block_based/partitioned_filter_block.cc
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ void PartitionedFilterBlockBuilder::MaybeCutAFilterBlock(
}

// Add the prefix of the next key before finishing the partition without
// updating last_prefix_str_. This hack, fixes a bug with format_verison=3
// updating last_prefix_str_. This hack fixes a bug with format_verison=3
// where seeking for the prefix would lead us to the previous partition.
const bool maybe_add_prefix =
next_key && prefix_extractor() && prefix_extractor()->InDomain(*next_key);
Expand Down
1 change: 1 addition & 0 deletions unreleased_history/bug_fixes/0seek_for_prev_prefix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* Fix a major bug in which an iterator using prefix filtering and SeekForPrev might miss data when the DB is using `whole_key_filtering=false` and `partition_filters=true`.

0 comments on commit 93b163d

Please sign in to comment.