Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect smallest key for delete-range tombstones #2799

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
34 changes: 34 additions & 0 deletions db/db_range_del_test.cc
Expand Up @@ -962,6 +962,40 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
}
}

TEST_F(DBRangeDelTest, UnorderedTombstones) {
  // Regression test for #2752: range-delete tombstones that fall into
  // different snapshot stripes are not stored in sorted order, so the first
  // tombstone of every snapshot stripe must be considered when picking the
  // output file's smallest key.
  Options options = CurrentOptions();
  DestroyAndReopen(options);

  auto default_cf = db_->DefaultColumnFamily();

  // Seed the DB with one key and push it down to L1 so the flushed
  // tombstone file lands in L0 above it.
  ASSERT_OK(db_->Put(WriteOptions(), default_cf, "a", "a"));
  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
  ASSERT_EQ(1, NumTableFilesAtLevel(0));
  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
  ASSERT_EQ(1, NumTableFilesAtLevel(1));

  ASSERT_OK(db_->DeleteRange(WriteOptions(), default_cf, "b", "c"));
  // Taking a snapshot between the two DeleteRange calls forces the
  // tombstones into separate snapshot stripes.
  auto snapshot = db_->GetSnapshot();
  ASSERT_OK(db_->DeleteRange(WriteOptions(), default_cf, "a", "b"));
  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
  db_->ReleaseSnapshot(snapshot);

  std::vector<std::vector<FileMetaData>> files;
  dbfull()->TEST_GetFilesMetaData(default_cf, &files);
  ASSERT_EQ(1, files[0].size());
  // The flushed file must cover both tombstones: user-key span ["a", "c"].
  ASSERT_EQ("a", files[0][0].smallest.user_key());
  ASSERT_EQ("c", files[0][0].largest.user_key());

  std::string value;
  auto get_status = db_->Get(ReadOptions(), "a", &value);
  ASSERT_TRUE(get_status.IsNotFound());
}

#endif // ROCKSDB_LITE

} // namespace rocksdb
Expand Down
38 changes: 21 additions & 17 deletions db/range_del_aggregator.cc
Expand Up @@ -413,6 +413,7 @@ void RangeDelAggregator::AddToBuilder(
// Note the order in which tombstones are stored is insignificant since we
// insert them into a std::map on the read path.
while (stripe_map_iter != rep_->stripe_map_.end()) {
bool first_added = false;
for (auto tombstone_map_iter = stripe_map_iter->second.raw_map.begin();
tombstone_map_iter != stripe_map_iter->second.raw_map.end();
++tombstone_map_iter) {
Expand Down Expand Up @@ -449,23 +450,26 @@ void RangeDelAggregator::AddToBuilder(

auto ikey_and_end_key = tombstone.Serialize();
builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second);
InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
if (lower_bound != nullptr &&
icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
*lower_bound) <= 0) {
// Pretend the smallest key has the same user key as lower_bound
// (the max key in the previous table or subcompaction) in order for
// files to appear key-space partitioned.
//
// Choose lowest seqnum so this file's smallest internal key comes
// after the previous file's/subcompaction's largest. The fake seqnum
// is OK because the read path's file-picking code only considers user
// key.
smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
}
if (meta->smallest.size() == 0 ||
icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
meta->smallest = std::move(smallest_candidate);
if (!first_added) {
first_added = true;
InternalKey smallest_candidate = std::move(ikey_and_end_key.first);
if (lower_bound != nullptr &&
icmp_.user_comparator()->Compare(smallest_candidate.user_key(),
*lower_bound) <= 0) {
// Pretend the smallest key has the same user key as lower_bound
// (the max key in the previous table or subcompaction) in order for
// files to appear key-space partitioned.
//
// Choose lowest seqnum so this file's smallest internal key comes
// after the previous file's/subcompaction's largest. The fake seqnum
// is OK because the read path's file-picking code only considers user
// key.
smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
}
if (meta->smallest.size() == 0 ||
icmp_.Compare(smallest_candidate, meta->smallest) < 0) {
meta->smallest = std::move(smallest_candidate);
}
}
InternalKey largest_candidate = tombstone.SerializeEndKey();
if (upper_bound != nullptr &&
Expand Down
32 changes: 0 additions & 32 deletions db/range_del_aggregator_test.cc
Expand Up @@ -149,38 +149,6 @@ TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) {
{"h", 0}});
}

TEST_F(RangeDelAggregatorTest, UnorderedTombstones) {
  // Regression test: two DeleteRange calls separated by a snapshot land in
  // different snapshot stripes; the smallest key of the flushed file must
  // still cover the earlier-starting tombstone so reads of "a" miss.
  // See more details in https://github.com/facebook/rocksdb/issues/2752.
  Options opts;
  opts.create_if_missing = true;
  opts.error_if_exists = true;
  ReadOptions ropts;
  WriteOptions wopts;
  FlushOptions fopts;
  CompactRangeOptions copts;

  std::string name = test::TmpDir() + "/UnorderedTombstones";

  DB* db = nullptr;
  ASSERT_OK(DB::Open(opts, name, &db));

  auto cf = db->DefaultColumnFamily();

  // Put one key and compact it to the bottom level so the later tombstone
  // file sits above it.
  ASSERT_OK(db->Put(wopts, cf, Slice("a"), Slice("a")));
  ASSERT_OK(db->Flush(fopts, cf));
  ASSERT_OK(db->CompactRange(copts, cf, nullptr, nullptr));

  ASSERT_OK(db->DeleteRange(wopts, cf, Slice("b"), Slice("c")));
  // Hold a snapshot to separate these two delete ranges.
  auto sn = db->GetSnapshot();
  ASSERT_OK(db->DeleteRange(wopts, cf, Slice("a"), Slice("b")));
  ASSERT_OK(db->Flush(fopts, cf));
  db->ReleaseSnapshot(sn);

  std::string v;
  auto s = db->Get(ropts, Slice("a"), &v);
  ASSERT_TRUE(s.IsNotFound());

  // Fix: the opened DB handle was leaked; close it so the test does not
  // trip leak detectors and releases the lock on the temp directory.
  delete db;
}
} // namespace rocksdb

int main(int argc, char** argv) {
Expand Down