Skip to content
Permalink
Browse files

Materialize the hash index

Summary:
Materialize the hash index to avoid the soaring cpu/flash usage
when initializing the database.

Test Plan: existing unit tests passed

Reviewers: sdong, haobo

Reviewed By: sdong

CC: leveldb

Differential Revision: https://reviews.facebook.net/D18339
  • Loading branch information
Kai Liu
Kai Liu committed May 15, 2014
1 parent 4e0602f commit 0b3d03d026a7248e438341264b4c6df339edc1d7
@@ -5,6 +5,9 @@
### Public API changes
* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories

### New Features
* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.

## 3.0.0 (05/05/2014)

### Public API changes
@@ -15,6 +15,8 @@

#include <map>
#include <memory>
#include <string>
#include <unordered_map>

#include "db/dbformat.h"

@@ -41,6 +43,8 @@

namespace rocksdb {

extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;
namespace {

typedef BlockBasedTableOptions::IndexType IndexType;
@@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
// design that just works.
class IndexBuilder {
public:
// Index builder will construct a set of blocks which contain:
// 1. One primary index block.
// 2. (Optional) a set of metablocks that contains the metadata of the
// primary index.
struct IndexBlocks {
Slice index_block_contents;
std::unordered_map<std::string, Slice> meta_blocks;
};
explicit IndexBuilder(const Comparator* comparator)
: comparator_(comparator) {}

@@ -72,15 +84,19 @@ class IndexBuilder {
// the last one in the table
//
// REQUIRES: Finish() has not yet been called.
virtual void AddEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) = 0;
virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) = 0;

// This method will be called whenever a key is added. The subclasses may
// override OnKeyAdded() if they need to collect additional information.
virtual void OnKeyAdded(const Slice& key) {}

// Inform the index builder that all entries has been written. Block builder
// may therefore perform any operation required for block finalization.
//
// REQUIRES: Finish() has not yet been called.
virtual Slice Finish() = 0;
virtual Status Finish(IndexBlocks* index_blocks) = 0;

// Get the estimated size for index block.
virtual size_t EstimatedSize() const = 0;
@@ -103,9 +119,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
: IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}

virtual void AddEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
@@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
}

virtual Slice Finish() override { return index_block_builder_.Finish(); }
virtual Status Finish(IndexBlocks* index_blocks) {
index_blocks->index_block_contents = index_block_builder_.Finish();
return Status::OK();
}

virtual size_t EstimatedSize() const {
return index_block_builder_.CurrentSizeEstimate();
@@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
BlockBuilder index_block_builder_;
};

// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
class FullKeyIndexBuilder : public IndexBuilder {
// HashIndexBuilder contains a binary-searchable primary index and the
// metadata for secondary hash index construction.
// The metadata for hash index consists two parts:
// - a metablock that compactly contains a sequence of prefixes. All prefixes
// are stored consectively without any metadata (like, prefix sizes) being
// stored, which is kept in the other metablock.
// - a metablock contains the metadata of the prefixes, including prefix size,
// restart index and number of block it spans. The format looks like:
//
// +-----------------+---------------------------+---------------------+ <=prefix 1
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+ <=prefix 2
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
// | |
// | .... |
// | |
// +-----------------+---------------------------+---------------------+ <=prefix n
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
//
// The reason of separating these two metablocks is to enable the efficiently
// reuse the first metablock during hash index construction without unnecessary
// data copy or small heap allocations for prefixes.
class HashIndexBuilder : public IndexBuilder {
public:
explicit FullKeyIndexBuilder(const Comparator* comparator)
explicit HashIndexBuilder(const Comparator* comparator,
const SliceTransform* hash_key_extractor)
: IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
primary_index_builder(comparator),
hash_key_extractor_(hash_key_extractor) {}

virtual void AddIndexEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
++current_restart_index_;
primary_index_builder.AddIndexEntry(last_key_in_current_block,
first_key_in_next_block, block_handle);
}

virtual void AddEntry(std::string* last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle) override {
std::string handle_encoding;
block_handle.EncodeTo(&handle_encoding);
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
virtual void OnKeyAdded(const Slice& key) override {
auto key_prefix = hash_key_extractor_->Transform(key);
bool is_first_entry = pending_block_num_ == 0;

// Keys may share the prefix
if (is_first_entry || pending_entry_prefix_ != key_prefix) {
if (!is_first_entry) {
FlushPendingPrefix();
}

// need a hard copy otherwise the underlying data changes all the time.
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
// copy.
pending_entry_prefix_ = key_prefix.ToString();
pending_block_num_ = 1;
pending_entry_index_ = current_restart_index_;
} else {
// entry number increments when keys share the prefix reside in
// differnt data blocks.
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
assert(last_restart_index <= current_restart_index_);
if (last_restart_index != current_restart_index_) {
++pending_block_num_;
}
}
}

virtual Slice Finish() override { return index_block_builder_.Finish(); }
virtual Status Finish(IndexBlocks* index_blocks) {
FlushPendingPrefix();
primary_index_builder.Finish(index_blocks);
index_blocks->meta_blocks.insert(
{kHashIndexPrefixesBlock.c_str(), prefix_block_});
index_blocks->meta_blocks.insert(
{kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
return Status::OK();
}

virtual size_t EstimatedSize() const {
return index_block_builder_.CurrentSizeEstimate();
return primary_index_builder.EstimatedSize() + prefix_block_.size() +
prefix_meta_block_.size();
}

private:
BlockBuilder index_block_builder_;
void FlushPendingPrefix() {
prefix_block_.append(pending_entry_prefix_.data(),
pending_entry_prefix_.size());
PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size());
PutVarint32(&prefix_meta_block_, pending_entry_index_);
PutVarint32(&prefix_meta_block_, pending_block_num_);
}

ShortenedIndexBuilder primary_index_builder;
const SliceTransform* hash_key_extractor_;

// stores a sequence of prefixes
std::string prefix_block_;
// stores the metadata of prefixes
std::string prefix_meta_block_;

// The following 3 variables keeps unflushed prefix and its metadata.
// The details of block_num and entry_index can be found in
// "block_hash_index.{h,cc}"
uint32_t pending_block_num_ = 0;
uint32_t pending_entry_index_ = 0;
std::string pending_entry_prefix_;

uint64_t current_restart_index_ = 0;
};

// Create a index builder based on its type.
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
const SliceTransform* prefix_extractor) {
switch (type) {
case BlockBasedTableOptions::kBinarySearch: {
return new ShortenedIndexBuilder(comparator);
}
case BlockBasedTableOptions::kHashSearch: {
return new HashIndexBuilder(comparator, prefix_extractor);
}
default: {
assert(!"Do not recognize the index type ");
return nullptr;
@@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
: public TablePropertiesCollector {
public:
BlockBasedTablePropertiesCollector(
explicit BlockBasedTablePropertiesCollector(
BlockBasedTableOptions::IndexType index_type)
: index_type_(index_type) {}

@@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
uint64_t offset = 0;
Status status;
BlockBuilder data_block;

InternalKeySliceTransform internal_prefix_transform;
std::unique_ptr<IndexBuilder> index_builder;

std::string last_key;
@@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
internal_comparator(icomparator),
file(f),
data_block(options, &internal_comparator),
index_builder(
CreateIndexBuilder(index_block_type, &internal_comparator)),
internal_prefix_transform(options.prefix_extractor.get()),
index_builder(CreateIndexBuilder(index_block_type, &internal_comparator,
&this->internal_prefix_transform)),
compression_type(compression_type),
checksum_type(checksum_type),
filter_block(opt.filter_policy == nullptr
@@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
}
};

// TODO(sdong): Currently only write out binary search index. In
// BlockBasedTableReader, Hash index will be built using binary search index.
BlockBasedTableBuilder::BlockBasedTableBuilder(
const Options& options, const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator, WritableFile* file,
CompressionType compression_type)
: rep_(new Rep(options, internal_comparator, file,
table_options.flush_block_policy_factory.get(),
compression_type,
BlockBasedTableOptions::IndexType::kBinarySearch,
compression_type, table_options.index_type,
table_options.checksum)) {
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
@@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->props.num_entries > 0) {
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
}

r->index_builder->OnKeyAdded(key);
auto should_flush = r->flush_block_policy->Update(key, value);
if (should_flush) {
assert(!r->data_block.empty());
@@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// blocks.
if (ok()) {
r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
}
}

@@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if (ok() && !empty_data_block) {
r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
r->pending_handle);
r->index_builder->AddIndexEntry(
&r->last_key, nullptr /* no next data block */, r->pending_handle);
}

IndexBuilder::IndexBlocks index_blocks;
auto s = r->index_builder->Finish(&index_blocks);
if (!s.ok()) {
return s;
}

// Write meta blocks and metaindex block with the following order.
// 1. [meta block: filter]
// 2. [meta block: properties]
// 3. [metaindex block]
if (ok()) {
MetaIndexBuilder meta_index_builer;
// 2. [other meta blocks]
// 3. [meta block: properties]
// 4. [metaindex block]
// write meta blocks
MetaIndexBuilder meta_index_builder;
for (const auto& item : index_blocks.meta_blocks) {
BlockHandle block_handle;
WriteBlock(item.second, &block_handle);
meta_index_builder.Add(item.first, block_handle);
}

// Write filter block.
if (ok()) {
if (r->filter_block != nullptr) {
// Add mapping from "<filter_block_prefix>.Name" to location
// of filter data.
std::string key = BlockBasedTable::kFilterBlockPrefix;
key.append(r->options.filter_policy->Name());
meta_index_builer.Add(key, filter_block_handle);
meta_index_builder.Add(key, filter_block_handle);
}

// Write properties block.
@@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
&properties_block_handle
);

meta_index_builer.Add(kPropertiesBlock,
properties_block_handle);
meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
} // end of properties block writing

WriteRawBlock(
meta_index_builer.Finish(),
kNoCompression,
&metaindex_block_handle
);
} // meta blocks and metaindex block.
} // meta blocks

// Write index block
if (ok()) {
WriteBlock(r->index_builder->Finish(), &index_block_handle);
// flush the meta index block
WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
&metaindex_block_handle);
WriteBlock(index_blocks.index_block_contents, &index_block_handle);
}

// Write footer
@@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
return rep_->offset;
}

const std::string BlockBasedTable::kFilterBlockPrefix =
"filter.";
const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";

} // namespace rocksdb
@@ -56,5 +56,8 @@ TableFactory* NewBlockBasedTableFactory(

const std::string BlockBasedTablePropertyNames::kIndexType =
"rocksdb.block.based.table.index.type";
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
const std::string kHashIndexPrefixesMetadataBlock =
"rocksdb.hashindex.metadata";

} // namespace rocksdb

0 comments on commit 0b3d03d

Please sign in to comment.
You can’t perform that action at this time.