Skip to content

Commit

Permalink
Improve performance the mix of range-delete and read workloads.
Browse files Browse the repository at this point in the history
The idea is to force flush after few range-delete operations.
Experimentation shows 32 or 64 giving good performance.

memtable_max_range_deletions is available as a ColumnFamilyOption.
The default is 0, which disables. it.
  • Loading branch information
vrdhn committed May 24, 2023
1 parent 28bf7ba commit a1e333c
Show file tree
Hide file tree
Showing 15 changed files with 161 additions and 4 deletions.
21 changes: 20 additions & 1 deletion db/memtable.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
data_size_(0),
num_entries_(0),
num_deletes_(0),
num_range_deletes_(0),
write_buffer_size_(mutable_cf_options.write_buffer_size),
flush_in_progress_(false),
flush_completed_(false),
Expand All @@ -114,7 +115,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp,
ioptions.memtable_insert_with_hint_prefix_extractor.get()),
oldest_key_time_(std::numeric_limits<uint64_t>::max()),
atomic_flush_seqno_(kMaxSequenceNumber),
approximate_memory_usage_(0) {
approximate_memory_usage_(0),
memtable_max_range_deletions_(
mutable_cf_options.memtable_max_range_deletions) {
UpdateFlushState();
// something went wrong if we need to flush before inserting anything
assert(!ShouldScheduleFlush());
Expand Down Expand Up @@ -170,6 +173,12 @@ size_t MemTable::ApproximateMemoryUsage() {
}

bool MemTable::ShouldFlushNow() {
// This is set if memtable_max_range_deletions is > 0,
// and that many range deletions are done
if (memtable_max_range_deletions_reached_) {
return true;
}

size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
// In a lot of times, we cannot allocate arena blocks that exactly matches the
// buffer size. Thus we have to decide if we should over-allocate or
Expand Down Expand Up @@ -754,6 +763,15 @@ Status MemTable::Add(SequenceNumber s, ValueType type,
type == kTypeDeletionWithTimestamp) {
num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
std::memory_order_relaxed);
} else if (type == kTypeRangeDeletion) {
uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1;
num_range_deletes_.store(val, std::memory_order_relaxed);
// Arrange for a flush if too many delete ranges have been issued.
if (memtable_max_range_deletions_ > 0 &&
memtable_max_range_deletions_reached_ == false &&
val > (uint64_t)memtable_max_range_deletions_) {
memtable_max_range_deletions_reached_ = true;
}
}

if (bloom_filter_ && prefix_extractor_ &&
Expand Down Expand Up @@ -1263,6 +1281,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
// Avoiding recording stats for speed.
return false;
}

PERF_TIMER_GUARD(get_from_memtable_time);

std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
Expand Down
8 changes: 8 additions & 0 deletions db/memtable.h
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ class MemTable {
std::atomic<uint64_t> data_size_;
std::atomic<uint64_t> num_entries_;
std::atomic<uint64_t> num_deletes_;
std::atomic<uint64_t> num_range_deletes_;

// Dynamically changeable memtable option
std::atomic<size_t> write_buffer_size_;
Expand Down Expand Up @@ -614,6 +615,13 @@ class MemTable {
// Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
std::atomic<uint64_t> approximate_memory_usage_;

// max range deletions in a memtable, before automatic flushing, 0 for
// unlimited.
uint32_t memtable_max_range_deletions_ = 0;
// Range-delete will turn this flag on, if memtable_max_range_deletions_ is
// reached. ShouldFlushNow() will check this.
bool memtable_max_range_deletions_reached_ = false;

// Flush job info of the current memtable.
std::unique_ptr<FlushJobInfo> flush_job_info_;

Expand Down
5 changes: 5 additions & 0 deletions include/rocksdb/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,11 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// Default: nullptr
std::shared_ptr<SstPartitionerFactory> sst_partitioner_factory = nullptr;

// Automatic flush after range deletions count in memtable hits this limit.
// helps with workloads having lot of range deletes.
// 0 to disable it completely
uint32_t memtable_max_range_deletions = 0;

// Create ColumnFamilyOptions with default values for all fields
ColumnFamilyOptions();
// Create ColumnFamilyOptions from Options
Expand Down
46 changes: 46 additions & 0 deletions java/rocksjni/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3904,6 +3904,28 @@ jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject,
opts->prepopulate_blob_cache);
}

/*
* Class: org_rocksdb_Options
* Method: setMemtableMaxRangeDeletions
* Signature: (JI)V
*/
void Java_org_rocksdb_Options_setMemtableMaxRangeDeletions(
JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) {
auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
opts->memtable_max_range_deletions = jmemtable_max_range_deletions;
}

/*
* Class: org_rocksdb_Options
* Method: memtableMaxRangeDeletions
* Signature: (J)I
*/
jint Java_org_rocksdb_Options_memtableMaxRangeDeletions(JNIEnv*, jobject,
jlong jhandle) {
auto* opts = reinterpret_cast<ROCKSDB_NAMESPACE::Options*>(jhandle);
return static_cast<jint>(opts->memtable_max_range_deletions);
}

//////////////////////////////////////////////////////////////////////////////
// ROCKSDB_NAMESPACE::ColumnFamilyOptions

Expand Down Expand Up @@ -5770,6 +5792,30 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*,
opts->prepopulate_blob_cache);
}

/*
* Class: org_rocksdb_ColumnFamilyOptions
* Method: setMemtableMaxRangeDeletions
* Signature: (JI)V
*/
void Java_org_rocksdb_ColumnFamilyOptions_setMemtableMaxRangeDeletions(
JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) {
auto* opts =
reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
opts->memtable_max_range_deletions = jmemtable_max_range_deletions;
}

/*
* Class: org_rocksdb_ColumnFamilyOptions
* Method: memtableMaxRangeDeletions
* Signature: (J)I
*/
jint Java_org_rocksdb_ColumnFamilyOptions_memtableMaxRangeDeletions(
JNIEnv*, jobject, jlong jhandle) {
auto* opts =
reinterpret_cast<ROCKSDB_NAMESPACE::ColumnFamilyOptions*>(jhandle);
return static_cast<jint>(opts->memtable_max_range_deletions);
}

/////////////////////////////////////////////////////////////////////
// ROCKSDB_NAMESPACE::DBOptions

Expand Down
13 changes: 13 additions & 0 deletions java/src/main/java/org/rocksdb/ColumnFamilyOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -959,6 +959,17 @@ public SstPartitionerFactory sstPartitionerFactory() {
return sstPartitionerFactory_;
}

@Override
public ColumnFamilyOptions setMemtableMaxRangeDeletions(final int count) {
setMemtableMaxRangeDeletions(nativeHandle_, count);
return this;
}

@Override
public int memtableMaxRangeDeletions() {
return memtableMaxRangeDeletions(nativeHandle_);
}

//
// BEGIN options for blobs (integrated BlobDB)
//
Expand Down Expand Up @@ -1498,6 +1509,8 @@ private native void setForceConsistencyChecks(final long handle,
private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle);
private static native void setCompactionThreadLimiter(
final long nativeHandle_, final long compactionThreadLimiterHandle);
private native void setMemtableMaxRangeDeletions(final long handle, final int count);
private native int memtableMaxRangeDeletions(final long handle);

private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles);
private native boolean enableBlobFiles(final long nativeHandle_);
Expand Down
17 changes: 17 additions & 0 deletions java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,23 @@ T setCompressionOptions(
@Experimental("Caution: this option is experimental")
SstPartitionerFactory sstPartitionerFactory();

/**
* Sets the maximum range delete calls, after which memtable is flushed.
* This applies to the mutable memtable.
*
* @param a positive integer, 0 (default) to disable the feature.
* @return the reference of the current options.
*/
T setMemtableMaxRangeDeletions(final int count);

/**
* Gets the current setting of maximum range deletes allowed
* 0(default) indicates that feature is disabled.
*
* @return current value of memtable_max_range_deletions
*/
int memtableMaxRangeDeletions();

/**
* Compaction concurrent thread limiter for the column family.
* If non-nullptr, use given concurrent thread limiter to control
Expand Down
13 changes: 13 additions & 0 deletions java/src/main/java/org/rocksdb/Options.java
Original file line number Diff line number Diff line change
Expand Up @@ -1984,6 +1984,17 @@ public SstPartitionerFactory sstPartitionerFactory() {
return sstPartitionerFactory_;
}

@Override
public Options setMemtableMaxRangeDeletions(final int count) {
setMemtableMaxRangeDeletions(nativeHandle_, count);
return this;
}

@Override
public int memtableMaxRangeDeletions() {
return memtableMaxRangeDeletions(nativeHandle_);
}

@Override
public Options setCompactionThreadLimiter(final ConcurrentTaskLimiter compactionThreadLimiter) {
setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_);
Expand Down Expand Up @@ -2502,6 +2513,8 @@ private native void setAtomicFlush(final long handle,
final boolean atomicFlush);
private native boolean atomicFlush(final long handle);
private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle);
private native void setMemtableMaxRangeDeletions(final long handle, final int count);
private native int memtableMaxRangeDeletions(final long handle);
private static native void setCompactionThreadLimiter(
final long nativeHandle_, final long newLimiterHandle);
private static native void setAvoidUnnecessaryBlockingIO(
Expand Down
10 changes: 10 additions & 0 deletions java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -709,4 +709,14 @@ public void cfPaths() throws IOException {
assertThat(options.cfPaths()).isEqualTo(paths);
}
}

@Test
public void memtableMaxRangeDeletions() {
try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) {
assertThat(options.memtableMaxRangeDeletions()).isEqualTo(0);
final int val = 32;
assertThat(options.setMemtableMaxRangeDeletions(val)).isEqualTo(options);
assertThat(options.memtableMaxRangeDeletions()).isEqualTo(val);
}
}
}
10 changes: 10 additions & 0 deletions java/src/test/java/org/rocksdb/OptionsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1452,6 +1452,16 @@ public void skipCheckingSstFileSizesOnDbOpen() {
}
}

@Test
public void memtableMaxRangeDeletions() {
try (final Options options = new Options()) {
assertThat(options.memtableMaxRangeDeletions()).isEqualTo(0);
final int val = 32;
assertThat(options.setMemtableMaxRangeDeletions(val)).isEqualTo(options);
assertThat(options.memtableMaxRangeDeletions()).isEqualTo(val);
}
}

@Test
public void eventListeners() {
final AtomicBoolean wasCalled1 = new AtomicBoolean();
Expand Down
5 changes: 5 additions & 0 deletions options/cf_options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,11 @@ static std::unordered_map<std::string, OptionTypeInfo>
}
})},
// End special case properties
{"memtable_max_range_deletions",
{offsetof(struct MutableCFOptions, memtable_max_range_deletions),
OptionType::kUInt32T, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},

};

static std::unordered_map<std::string, OptionTypeInfo>
Expand Down
7 changes: 5 additions & 2 deletions options/cf_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ struct MutableCFOptions {
block_protection_bytes_per_key(options.block_protection_bytes_per_key),
sample_for_compression(
options.sample_for_compression), // TODO: is 0 fine here?
compression_per_level(options.compression_per_level) {
compression_per_level(options.compression_per_level),
memtable_max_range_deletions(options.memtable_max_range_deletions) {
RefreshDerivedOptions(options.num_levels, options.compaction_style);
}

Expand Down Expand Up @@ -224,7 +225,8 @@ struct MutableCFOptions {
last_level_temperature(Temperature::kUnknown),
memtable_protection_bytes_per_key(0),
block_protection_bytes_per_key(0),
sample_for_compression(0) {}
sample_for_compression(0),
memtable_max_range_deletions(0) {}

explicit MutableCFOptions(const Options& options);

Expand Down Expand Up @@ -318,6 +320,7 @@ struct MutableCFOptions {

uint64_t sample_for_compression;
std::vector<CompressionType> compression_per_level;
uint32_t memtable_max_range_deletions;

// Derived options
// Per-level target file size.
Expand Down
2 changes: 2 additions & 0 deletions options/options.cc
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,8 @@ void ColumnFamilyOptions::Dump(Logger* log) const {
}
ROCKS_LOG_HEADER(log, "Options.experimental_mempurge_threshold: %f",
experimental_mempurge_threshold);
ROCKS_LOG_HEADER(log, " Options.memtable_max_range_deletions: %d",
memtable_max_range_deletions);
} // ColumnFamilyOptions::Dump

void Options::Dump(Logger* log) const {
Expand Down
1 change: 1 addition & 0 deletions options/options_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
cf_opts->compression_per_level = moptions.compression_per_level;
cf_opts->last_level_temperature = moptions.last_level_temperature;
cf_opts->bottommost_temperature = moptions.last_level_temperature;
cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions;
}

void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions,
Expand Down
3 changes: 2 additions & 1 deletion options/options_settable_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
"blob_cache=1M;"
"memtable_protection_bytes_per_key=2;"
"persist_user_defined_timestamps=true;"
"block_protection_bytes_per_key=1;",
"block_protection_bytes_per_key=1;"
"memtable_max_range_deletions=999999;",
new_options));

ASSERT_NE(new_options->blob_cache.get(), nullptr);
Expand Down
4 changes: 4 additions & 0 deletions options/options_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"prepopulate_blob_cache", "kDisable"},
{"last_level_temperature", "kWarm"},
{"persist_user_defined_timestamps", "true"},
{"memtable_max_range_deletions", "0"},
};

std::unordered_map<std::string, std::string> db_options_map = {
Expand Down Expand Up @@ -284,6 +285,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm);
ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm);
ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true);
ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0);

cf_options_map["write_buffer_size"] = "hello";
ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map,
Expand Down Expand Up @@ -2338,6 +2340,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
{"prepopulate_blob_cache", "kDisable"},
{"last_level_temperature", "kWarm"},
{"persist_user_defined_timestamps", "true"},
{"memtable_max_range_deletions", "0"},
};

std::unordered_map<std::string, std::string> db_options_map = {
Expand Down Expand Up @@ -2489,6 +2492,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm);
ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm);
ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true);
ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0);

cf_options_map["write_buffer_size"] = "hello";
ASSERT_NOK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt,
Expand Down

0 comments on commit a1e333c

Please sign in to comment.