Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Change OptimizeForPointLookup() and OptimizeForSmallDb() (#5165)
Summary:
Change the behavior of OptimizeForSmallDb() so that it is less likely to run out of memory.
Change the behavior of OptimizeForPointLookup() to take advantage of the new memtable whole key filter, and move away from prefix extractor as well as hash-based indexing, as they are prone to misuse.
Pull Request resolved: #5165

Differential Revision: D14880709

Pulled By: siying

fbshipit-source-id: 9af30e3c9e151eceea6d6b38701a58f1f9fb692d
  • Loading branch information
siying authored and facebook-github-bot committed Apr 11, 2019
1 parent d3d20dc commit ed9f5e2
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 10 deletions.
2 changes: 2 additions & 0 deletions HISTORY.md
Expand Up @@ -6,6 +6,8 @@
* Introduce Periodic Compaction for Level style compaction. Files are re-compacted periodically and put in the same level.

### Public API Change
* Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering.
* Change the behavior of OptimizeForSmallDb(): use a 16MB block cache, put index and filter blocks into it, and cost the memtable size to it. DBOptions.OptimizeForSmallDb() and ColumnFamilyOptions.OptimizeForSmallDb() now take an optional cache object.
### Bug Fixes
* Fix a bug in 2PC where a sequence of txn prepare, memtable flush, and crash could result in losing the prepared transaction.
* Fix a bug in Encryption Env which could cause encrypted files to be read beyond file boundaries.
Expand Down
35 changes: 35 additions & 0 deletions db/db_test2.cc
Expand Up @@ -2396,6 +2396,41 @@ TEST_F(DBTest2, OptimizeForPointLookup) {
ASSERT_EQ("v1", Get("foo"));
}

// Checks that Options::OptimizeForSmallDb() configures a block cache whose
// usage grows both when data is written to the memtable and when a block is
// read back after a flush.
TEST_F(DBTest2, OptimizeForSmallDB) {
  Options options = CurrentOptions();
  Close();
  options.OptimizeForSmallDb();

  // Find the cache object
  ASSERT_EQ(std::string(BlockBasedTableFactory::kName),
            std::string(options.table_factory->Name()));
  BlockBasedTableOptions* table_options =
      reinterpret_cast<BlockBasedTableOptions*>(
          options.table_factory->GetOptions());
  ASSERT_TRUE(table_options != nullptr);
  std::shared_ptr<Cache> cache = table_options->block_cache;
  // Fail with a clear assertion instead of dereferencing a null cache below.
  ASSERT_TRUE(cache != nullptr);

  ASSERT_EQ(0, cache->GetUsage());
  ASSERT_OK(DB::Open(options, dbname_, &db_));
  ASSERT_OK(Put("foo", "v1"));

  // memtable size is costed to the block cache
  ASSERT_NE(0, cache->GetUsage());

  ASSERT_EQ("v1", Get("foo"));
  // Check the flush status so that a failed flush is reported here rather
  // than showing up as a confusing cache-usage mismatch further down.
  ASSERT_OK(Flush());

  // Remember block cache size, so that we can find that
  // it is filled after Get().
  size_t prev_size = cache->GetUsage();
  // Use a pinnable slice so that it can pin the block, so that
  // when we check the size it is not evicted.
  PinnableSlice value;
  ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
  ASSERT_GT(cache->GetUsage(), prev_size);
  value.Reset();
}

#endif // ROCKSDB_LITE

TEST_F(DBTest2, GetRaceFlush1) {
Expand Down
5 changes: 3 additions & 2 deletions env/env_encryption.cc
Expand Up @@ -6,9 +6,9 @@
#ifndef ROCKSDB_LITE

#include <algorithm>
#include <cassert>
#include <cctype>
#include <iostream>
#include <cassert>

#include "rocksdb/env_encryption.h"
#include "util/aligned_buffer.h"
Expand Down Expand Up @@ -897,7 +897,8 @@ Status CTREncryptionProvider::CreateCipherStream(
// very large chunk of the file (and very likely read over the bounds)
assert(prefix.size() >= 2 * blockSize);
if (prefix.size() < 2 * blockSize) {
return Status::Corruption("Unable to read from file " + fname + ": read attempt would read beyond file bounds");
return Status::Corruption("Unable to read from file " + fname +
": read attempt would read beyond file bounds");
}

// Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted)
Expand Down
8 changes: 6 additions & 2 deletions include/rocksdb/options.h
Expand Up @@ -88,7 +88,9 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
// Some functions that make it easier to optimize RocksDB
// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
// An optional cache object can be passed in to be used as the block cache.
// When `cache` is nullptr, no cache is supplied to the table options (their
// block_cache is left as an empty pointer).
// Returns a pointer to this object.
ColumnFamilyOptions* OptimizeForSmallDb(
    std::shared_ptr<Cache>* cache = nullptr);

// Use this if you don't need to keep the data sorted, i.e. you'll never use
// an iterator, only Put() and Get() API calls
Expand Down Expand Up @@ -349,7 +351,9 @@ struct DBOptions {

// Use this if your DB is very small (like under 1GB) and you don't want to
// spend lots of memory for memtables.
// An optional cache object can be passed in; the memory used by memtables is
// costed to it through the write buffer manager that this call installs.
// When `cache` is nullptr, the write buffer manager is created without a
// cache. Returns a pointer to this object.
DBOptions* OptimizeForSmallDb(std::shared_ptr<Cache>* cache = nullptr);

#ifndef ROCKSDB_LITE
// By default, RocksDB uses only one background thread for flush and
Expand Down
33 changes: 27 additions & 6 deletions options/options.cc
Expand Up @@ -413,8 +413,11 @@ Options::PrepareForBulkLoad()
}

// Applies both the column-family and the DB-wide small-DB tunings, handing
// the same 16MB LRU cache to each so that table blocks and memtable memory
// are accounted against one cache. Returns this.
Options* Options::OptimizeForSmallDb() {
  // 16MB block cache
  std::shared_ptr<Cache> cache = NewLRUCache(16 << 20);

  // Call each base-class tuning exactly once, with the shared cache. A
  // cacheless call beforehand would only build a table factory and a write
  // buffer manager that are immediately replaced.
  ColumnFamilyOptions::OptimizeForSmallDb(&cache);
  DBOptions::OptimizeForSmallDb(&cache);
  return this;
}

Expand Down Expand Up @@ -469,27 +472,44 @@ ColumnFamilyOptions* ColumnFamilyOptions::OldDefaults(
}

// Optimization functions
// Tunes DB-wide options for a small database: one file-opening thread, at
// most 5000 open files, and a write buffer manager that costs memtable
// memory to `cache`.
//
// `cache` may be nullptr, in which case the write buffer manager is created
// with an empty cache pointer. Returns this.
DBOptions* DBOptions::OptimizeForSmallDb(std::shared_ptr<Cache>* cache) {
  max_file_opening_threads = 1;
  max_open_files = 5000;

  // Cost memtable to block cache too.
  // NOTE(review): the leading 0 is presumably the buffer size, with 0 meaning
  // "no limit" -- confirm against the WriteBufferManager constructor.
  write_buffer_manager = std::make_shared<rocksdb::WriteBufferManager>(
      0, (cache != nullptr) ? *cache : std::shared_ptr<Cache>());

  return this;
}

// Tunes column-family options for a small database: small write buffer and
// file/level size targets, plus a block-based table factory that uses
// `cache` as its block cache and keeps index and filter blocks in it.
//
// `cache` may be nullptr, in which case the table options get an empty
// block_cache pointer. Returns this.
ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb(
    std::shared_ptr<Cache>* cache) {
  write_buffer_size = 2 << 20;                           // 2MB
  target_file_size_base = 2 * 1048576;                   // 2MB
  max_bytes_for_level_base = 10 * 1048576;               // 10MB
  soft_pending_compaction_bytes_limit = 256 * 1048576;   // 256MB
  hard_pending_compaction_bytes_limit = 1073741824ul;    // 1GB

  BlockBasedTableOptions table_options;
  table_options.block_cache =
      (cache != nullptr) ? *cache : std::shared_ptr<Cache>();
  table_options.cache_index_and_filter_blocks = true;
  // Two level iterator to avoid LRU cache imbalance
  table_options.index_type =
      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  table_factory.reset(new BlockBasedTableFactory(table_options));

  return this;
}

#ifndef ROCKSDB_LITE
ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
uint64_t block_cache_size_mb) {
prefix_extractor.reset(NewNoopTransform());
BlockBasedTableOptions block_based_options;
block_based_options.index_type = BlockBasedTableOptions::kHashSearch;
block_based_options.data_block_index_type =
BlockBasedTableOptions::kDataBlockBinaryAndHash;
block_based_options.data_block_hash_table_util_ratio = 0.75;
Expand All @@ -498,6 +518,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup(
NewLRUCache(static_cast<size_t>(block_cache_size_mb * 1024 * 1024));
table_factory.reset(new BlockBasedTableFactory(block_based_options));
memtable_prefix_bloom_size_ratio = 0.02;
memtable_whole_key_filtering = true;
return this;
}

Expand Down

0 comments on commit ed9f5e2

Please sign in to comment.