Skip to content

Commit

Permalink
Make BlockBasedTable::kMaxAutoReadAheadSize configurable (#7951)
Browse files Browse the repository at this point in the history
Summary:
RocksDB does auto-readahead for iterators on noticing more
than two reads for a table file. The readahead starts at 8KB and doubles on every
additional read up to BlockBasedTable::kMaxAutoReadaheadSize, which is
256*1024.
This PR adds a new option, BlockBasedTableOptions::max_auto_readahead_size, which
replaces BlockBasedTable::kMaxAutoReadaheadSize; the new option can be
configured.
If max_auto_readahead_size is set to 0, no implicit auto prefetching will
be done. If the max_auto_readahead_size provided is less than
8KB (which is the initial readahead size used by RocksDB for
auto-readahead), the readahead size will stay equal to max_auto_readahead_size.

Pull Request resolved: #7951

Test Plan: Add new unit test case.

Reviewed By: anand1976

Differential Revision: D26568085

Pulled By: akankshamahajan15

fbshipit-source-id: b6543520fc74e97d859f2002328d4c5254d417af
  • Loading branch information
akankshamahajan15 authored and facebook-github-bot committed Feb 24, 2021
1 parent e017af1 commit cd79a00
Show file tree
Hide file tree
Showing 8 changed files with 210 additions and 19 deletions.
2 changes: 2 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Rocksdb Change Log
## Unreleased
### Public API change
* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if the user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size, and max_auto_readahead_size can now be configured dynamically as well. Experiments showed that a 256 KB readahead size provides the best performance for auto readahead; the experiment data is in PR #3282. If the value is set to 0, no automatic prefetching will be done by RocksDB. Changing the value only affects files opened after the change.

## 6.18.0 (02/19/2021)
### Behavior Changes
Expand Down
145 changes: 145 additions & 0 deletions file/prefetch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,154 @@ TEST_P(PrefetchTest, Basic) {
Close();
}

#ifndef ROCKSDB_LITE
// Verifies that BlockBasedTableOptions::max_auto_readahead_size, changed
// dynamically through DB::SetOptions, controls implicit iterator prefetching:
//   * 0     -> no prefetching at all (checked on level 0),
//   * 4096  -> prefetching happens with a readahead below
//              BlockBasedTable::kInitAutoReadaheadSize (checked on level 1),
//   * 65536 -> prefetching happens with a larger cap (checked on level 2).
// The test is parameterized on (FS prefetch support, direct IO).
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
  // First param is if the mockFS support_prefetch or not
  const bool support_prefetch =
      std::get<0>(GetParam()) &&
      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);

  // Second param is if directIO is enabled or not
  const bool use_direct_io = std::get<1>(GetParam());

  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  Options options = CurrentOptions();
  options.write_buffer_size = 1024;
  options.create_if_missing = true;
  options.compression = kNoCompression;
  options.env = env.get();
  options.disable_auto_compactions = true;
  if (use_direct_io) {
    options.use_direct_reads = true;
    options.use_direct_io_for_flush_and_compaction = true;
  }
  BlockBasedTableOptions table_options;
  table_options.no_block_cache = true;
  table_options.cache_index_and_filter_blocks = false;
  table_options.metadata_block_size = 1024;
  table_options.index_type =
      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
  table_options.max_auto_readahead_size = 0;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  // Counts prefetches served by the internal FilePrefetchBuffer (as opposed
  // to prefetch calls issued to the file system, which MockFS tracks).
  int buff_prefetch_count = 0;
  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
                                        [&](void*) { buff_prefetch_count++; });

  // DB open will create table readers unless we reduce the table cache
  // capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
  // cache is allocated with max_open_files - 10 as capacity. So override
  // max_open_files to 11 so table cache capacity will become 1. This is meant
  // to prevent files from staying open after DB open and force the file to be
  // opened during Iteration.
  SyncPoint::GetInstance()->SetCallBack(
      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
        int* max_open_files = static_cast<int*>(arg);
        *max_open_files = 11;
      });

  SyncPoint::GetInstance()->EnableProcessing();

  Status s = TryReopen(options);

  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test. The callbacks registered
    // above capture locals by reference, so uninstall them before leaving
    // this scope instead of letting them outlive the test body.
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
    return;
  }
  ASSERT_OK(s);

  Random rnd(309);
  int key_count = 0;
  const int num_keys_per_level = 100;
  // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
  for (int level = 2; level >= 0; level--) {
    key_count = level * num_keys_per_level;
    for (int i = 0; i < num_keys_per_level; ++i) {
      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
    }
    ASSERT_OK(Flush());
    MoveFilesToLevel(level);
  }
  Close();

  // Per-level snapshot of buff_prefetch_count, indexed by level.
  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
  // The reopen must succeed: db_ is dereferenced immediately below.
  ASSERT_OK(TryReopen(options));
  {
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    fs->ClearPrefetchCount();
    buff_prefetch_count = 0;

    for (int level = 2; level >= 0; level--) {
      key_count = level * num_keys_per_level;
      switch (level) {
        case 0:
          // max_auto_readahead_size is set 0 so data and index blocks are not
          // prefetched.
          ASSERT_OK(db_->SetOptions(
              {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
          break;
        case 1:
          // max_auto_readahead_size is set less than
          // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains
          // equal to max_auto_readahead_size.
          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                      "{max_auto_readahead_size=4096;}"}}));
          break;
        case 2:
          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
                                      "{max_auto_readahead_size=65536;}"}}));
          break;
        default:
          assert(false);
      }

      for (int i = 0; i < num_keys_per_level; ++i) {
        iter->Seek(Key(key_count++));
        iter->Next();
      }

      buff_prefetch_level_count[level] = buff_prefetch_count;
      if (support_prefetch && !use_direct_io) {
        // File-system prefetch path: only MockFS sees the prefetch calls.
        if (level == 0) {
          ASSERT_FALSE(fs->IsPrefetchCalled());
        } else {
          ASSERT_TRUE(fs->IsPrefetchCalled());
        }
        fs->ClearPrefetchCount();
      } else {
        // Internal prefetch buffer path: the file system must not be asked to
        // prefetch; the sync point counter reflects the buffer's activity.
        ASSERT_FALSE(fs->IsPrefetchCalled());
        if (level == 0) {
          ASSERT_EQ(buff_prefetch_count, 0);
        } else {
          ASSERT_GT(buff_prefetch_count, 0);
        }
        buff_prefetch_count = 0;
      }
    }
  }

  // Without FS prefetch support, the smaller cap used for level 1 (4096) is
  // expected to need more buffer prefetches than the larger cap used for
  // level 2 (65536).
  if (!support_prefetch) {
    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  Close();
}

// Instantiate the PrefetchTest cases over every combination of the two bool
// parameters: (mock FS prefetch support, direct IO enabled).
INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
::testing::Combine(::testing::Bool(),
::testing::Bool()));
#endif // !ROCKSDB_LITE

// Test fixture parameterized on a single bool. The tests that consume the
// parameter are not visible in this chunk, so its meaning is defined there.
class PrefetchTest1 : public DBTestBase,
public ::testing::WithParamInterface<bool> {
public:
// Passes "/prefetch_test1" and `true` to the DBTestBase constructor.
// NOTE(review): the bool is presumably DBTestBase's env_do_fsync flag —
// confirm against the DBTestBase declaration.
PrefetchTest1() : DBTestBase("/prefetch_test1", true) {}
};

} // namespace ROCKSDB_NAMESPACE

Expand Down
27 changes: 27 additions & 0 deletions include/rocksdb/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,33 @@ struct BlockBasedTableOptions {

IndexShorteningMode index_shortening =
IndexShorteningMode::kShortenSeparators;

// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file if the user doesn't provide readahead_size. The readahead
// starts at 8KB and doubles on every additional read up to
// max_auto_readahead_size, which is configurable.
//
// Special Value: 0 - If max_auto_readahead_size is set to 0 then no implicit
// auto prefetching will be done. If the max_auto_readahead_size provided is
// less than 8KB (which is the initial readahead size used by RocksDB for
// auto-readahead), the readahead size will stay equal to
// max_auto_readahead_size.
//
// The value is specified in bytes, e.g. 256 * 1024 for 256 KB.
//
//
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
//
// This parameter can be changed dynamically by
// DB::SetOptions({{"block_based_table_factory",
// "{max_auto_readahead_size=0;}"}});
//
// Changing the value dynamically will only affect files opened after the
// change.
//
// Default: 256 KB (256 * 1024).
size_t max_auto_readahead_size = 256 * 1024;
};

// Table Properties that are specific to block-based table properties.
Expand Down
3 changes: 2 additions & 1 deletion options/options_settable_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
"hash_index_allow_collision=false;"
"verify_compression=true;read_amp_bytes_per_bit=0;"
"enable_index_compression=false;"
"block_align=true",
"block_align=true;"
"max_auto_readahead_size=0",
new_bbto));

ASSERT_EQ(unset_bytes_base,
Expand Down
7 changes: 7 additions & 0 deletions table/block_based/block_based_table_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr);
return Cache::CreateFromString(opts, value, cache);
}}},
{"max_auto_readahead_size",
{offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
#endif // ROCKSDB_LITE
};

Expand Down Expand Up @@ -687,6 +691,9 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
snprintf(buffer, kBufferSize, " block_align: %d\n",
table_options_.block_align);
ret.append(buffer);
snprintf(buffer, kBufferSize,
" max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
table_options_.max_auto_readahead_size);
return ret;
}

Expand Down
7 changes: 1 addition & 6 deletions table/block_based/block_based_table_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,6 @@ extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;


// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;

// Destroys the table reader by deleting the object pointed to by rep_.
BlockBasedTable::~BlockBasedTable() {
delete rep_;
}
Expand Down Expand Up @@ -2921,7 +2916,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
// increasing of the buffer size.
size_t readahead_size = (read_options.readahead_size != 0)
? read_options.readahead_size
: kMaxAutoReadaheadSize;
: rep_->table_options.max_auto_readahead_size;
// FilePrefetchBuffer doesn't work in mmap mode and readahead is not
// needed there.
FilePrefetchBuffer prefetch_buffer(
Expand Down
3 changes: 0 additions & 3 deletions table/block_based/block_based_table_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@ class BlockBasedTable : public TableReader {

// All the below fields control iterator readahead
static const size_t kInitAutoReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
static const size_t kMaxAutoReadaheadSize;
static const int kMinNumFileReadsToStartAutoReadahead = 2;

// Attempt to open the table that is stored in bytes [0..file_size)
Expand Down
35 changes: 26 additions & 9 deletions table/block_based/block_prefetcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,23 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
return;
}

size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize;

// If max_auto_readahead_size is set to be 0 by user, no data will be
// prefetched.
if (max_auto_readahead_size == 0) {
return;
}

if (initial_auto_readahead_size > max_auto_readahead_size) {
initial_auto_readahead_size = max_auto_readahead_size;
}

if (rep->file->use_direct_io()) {
rep->CreateFilePrefetchBufferIfNotExists(
BlockBasedTable::kInitAutoReadaheadSize,
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
max_auto_readahead_size,
&prefetch_buffer_);
return;
}

Expand All @@ -47,20 +60,24 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
return;
}

if (readahead_size_ > max_auto_readahead_size) {
readahead_size_ = max_auto_readahead_size;
}

// If prefetch is not supported, fall back to use internal prefetch buffer.
// Discarding other return status of Prefetch calls intentionally, as
// we can fallback to reading from disk if Prefetch fails.
Status s = rep->file->Prefetch(handle.offset(), readahead_size_);
if (s.IsNotSupported()) {
rep->CreateFilePrefetchBufferIfNotExists(
BlockBasedTable::kInitAutoReadaheadSize,
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
max_auto_readahead_size,
&prefetch_buffer_);
return;
}
readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_);

// Keep exponentially increasing readahead size until
// kMaxAutoReadaheadSize.
readahead_size_ =
std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2);
// max_auto_readahead_size.
readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
}
} // namespace ROCKSDB_NAMESPACE

0 comments on commit cd79a00

Please sign in to comment.