Permalink
Browse files

Materialize the hash index

Summary:
Materialize the hash index to avoid the soaring cpu/flash usage
when initializing the database.

Test Plan: existing unit tests passed

Reviewers: sdong, haobo

Reviewed By: sdong

CC: leveldb

Differential Revision: https://reviews.facebook.net/D18339
  • Loading branch information...
Kai Liu
Kai Liu committed May 15, 2014
1 parent 4e0602f commit 0b3d03d026a7248e438341264b4c6df339edc1d7
View
@@ -5,6 +5,9 @@
### Public API changes
* Replaced ColumnFamilyOptions::table_properties_collectors with ColumnFamilyOptions::table_properties_collector_factories
+### New Features
+* Hash index for block-based table will be materialized and reconstructed more efficiently. Previously hash index is constructed by scanning the whole table during every table open.
+
## 3.0.0 (05/05/2014)
### Public API changes
@@ -15,6 +15,8 @@
#include <map>
#include <memory>
+#include <string>
+#include <unordered_map>
#include "db/dbformat.h"
@@ -41,6 +43,8 @@
namespace rocksdb {
+extern const std::string kHashIndexPrefixesBlock;
+extern const std::string kHashIndexPrefixesMetadataBlock;
namespace {
typedef BlockBasedTableOptions::IndexType IndexType;
@@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
// design that just works.
class IndexBuilder {
public:
+ // Index builder will construct a set of blocks which contain:
+ // 1. One primary index block.
+ // 2. (Optional) a set of metablocks that contains the metadata of the
+ // primary index.
+ struct IndexBlocks {
+ Slice index_block_contents;
+ std::unordered_map<std::string, Slice> meta_blocks;
+ };
explicit IndexBuilder(const Comparator* comparator)
: comparator_(comparator) {}
@@ -72,15 +84,19 @@ class IndexBuilder {
// the last one in the table
//
// REQUIRES: Finish() has not yet been called.
- virtual void AddEntry(std::string* last_key_in_current_block,
- const Slice* first_key_in_next_block,
- const BlockHandle& block_handle) = 0;
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) = 0;
+
+ // This method will be called whenever a key is added. The subclasses may
+ // override OnKeyAdded() if they need to collect additional information.
+ virtual void OnKeyAdded(const Slice& key) {}
// Inform the index builder that all entries has been written. Block builder
// may therefore perform any operation required for block finalization.
//
// REQUIRES: Finish() has not yet been called.
- virtual Slice Finish() = 0;
+ virtual Status Finish(IndexBlocks* index_blocks) = 0;
// Get the estimated size for index block.
virtual size_t EstimatedSize() const = 0;
@@ -103,9 +119,9 @@ class ShortenedIndexBuilder : public IndexBuilder {
: IndexBuilder(comparator),
index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
- virtual void AddEntry(std::string* last_key_in_current_block,
- const Slice* first_key_in_next_block,
- const BlockHandle& block_handle) override {
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) override {
if (first_key_in_next_block != nullptr) {
comparator_->FindShortestSeparator(last_key_in_current_block,
*first_key_in_next_block);
@@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
}
- virtual Slice Finish() override { return index_block_builder_.Finish(); }
+ virtual Status Finish(IndexBlocks* index_blocks) {
+ index_blocks->index_block_contents = index_block_builder_.Finish();
+ return Status::OK();
+ }
virtual size_t EstimatedSize() const {
return index_block_builder_.CurrentSizeEstimate();
@@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
BlockBuilder index_block_builder_;
};
-// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
-// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
-class FullKeyIndexBuilder : public IndexBuilder {
+// HashIndexBuilder contains a binary-searchable primary index and the
+// metadata for secondary hash index construction.
+// The metadata for hash index consists two parts:
+// - a metablock that compactly contains a sequence of prefixes. All prefixes
+// are stored consectively without any metadata (like, prefix sizes) being
+// stored, which is kept in the other metablock.
+// - a metablock contains the metadata of the prefixes, including prefix size,
+// restart index and number of block it spans. The format looks like:
+//
+// +-----------------+---------------------------+---------------------+ <=prefix 1
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+ <=prefix 2
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+// | |
+// | .... |
+// | |
+// +-----------------+---------------------------+---------------------+ <=prefix n
+// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
+// +-----------------+---------------------------+---------------------+
+//
+// The reason of separating these two metablocks is to enable the efficiently
+// reuse the first metablock during hash index construction without unnecessary
+// data copy or small heap allocations for prefixes.
+class HashIndexBuilder : public IndexBuilder {
public:
- explicit FullKeyIndexBuilder(const Comparator* comparator)
+ explicit HashIndexBuilder(const Comparator* comparator,
+ const SliceTransform* hash_key_extractor)
: IndexBuilder(comparator),
- index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {}
+ primary_index_builder(comparator),
+ hash_key_extractor_(hash_key_extractor) {}
+
+ virtual void AddIndexEntry(std::string* last_key_in_current_block,
+ const Slice* first_key_in_next_block,
+ const BlockHandle& block_handle) override {
+ ++current_restart_index_;
+ primary_index_builder.AddIndexEntry(last_key_in_current_block,
+ first_key_in_next_block, block_handle);
+ }
- virtual void AddEntry(std::string* last_key_in_current_block,
- const Slice* first_key_in_next_block,
- const BlockHandle& block_handle) override {
- std::string handle_encoding;
- block_handle.EncodeTo(&handle_encoding);
- index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
+ virtual void OnKeyAdded(const Slice& key) override {
+ auto key_prefix = hash_key_extractor_->Transform(key);
+ bool is_first_entry = pending_block_num_ == 0;
+
+ // Keys may share the prefix
+ if (is_first_entry || pending_entry_prefix_ != key_prefix) {
+ if (!is_first_entry) {
+ FlushPendingPrefix();
+ }
+
+ // need a hard copy otherwise the underlying data changes all the time.
+ // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
+ // copy.
+ pending_entry_prefix_ = key_prefix.ToString();
+ pending_block_num_ = 1;
+ pending_entry_index_ = current_restart_index_;
+ } else {
+ // entry number increments when keys share the prefix reside in
+ // differnt data blocks.
+ auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
+ assert(last_restart_index <= current_restart_index_);
+ if (last_restart_index != current_restart_index_) {
+ ++pending_block_num_;
+ }
+ }
}
- virtual Slice Finish() override { return index_block_builder_.Finish(); }
+ virtual Status Finish(IndexBlocks* index_blocks) {
+ FlushPendingPrefix();
+ primary_index_builder.Finish(index_blocks);
+ index_blocks->meta_blocks.insert(
+ {kHashIndexPrefixesBlock.c_str(), prefix_block_});
+ index_blocks->meta_blocks.insert(
+ {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
+ return Status::OK();
+ }
virtual size_t EstimatedSize() const {
- return index_block_builder_.CurrentSizeEstimate();
+ return primary_index_builder.EstimatedSize() + prefix_block_.size() +
+ prefix_meta_block_.size();
}
private:
- BlockBuilder index_block_builder_;
+ void FlushPendingPrefix() {
+ prefix_block_.append(pending_entry_prefix_.data(),
+ pending_entry_prefix_.size());
+ PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size());
+ PutVarint32(&prefix_meta_block_, pending_entry_index_);
+ PutVarint32(&prefix_meta_block_, pending_block_num_);
+ }
+
+ ShortenedIndexBuilder primary_index_builder;
+ const SliceTransform* hash_key_extractor_;
+
+ // stores a sequence of prefixes
+ std::string prefix_block_;
+ // stores the metadata of prefixes
+ std::string prefix_meta_block_;
+
+ // The following 3 variables keeps unflushed prefix and its metadata.
+ // The details of block_num and entry_index can be found in
+ // "block_hash_index.{h,cc}"
+ uint32_t pending_block_num_ = 0;
+ uint32_t pending_entry_index_ = 0;
+ std::string pending_entry_prefix_;
+
+ uint64_t current_restart_index_ = 0;
};
// Create a index builder based on its type.
-IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
+IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator,
+ const SliceTransform* prefix_extractor) {
switch (type) {
case BlockBasedTableOptions::kBinarySearch: {
return new ShortenedIndexBuilder(comparator);
}
+ case BlockBasedTableOptions::kHashSearch: {
+ return new HashIndexBuilder(comparator, prefix_extractor);
+ }
default: {
assert(!"Do not recognize the index type ");
return nullptr;
@@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
: public TablePropertiesCollector {
public:
- BlockBasedTablePropertiesCollector(
+ explicit BlockBasedTablePropertiesCollector(
BlockBasedTableOptions::IndexType index_type)
: index_type_(index_type) {}
@@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
uint64_t offset = 0;
Status status;
BlockBuilder data_block;
+
+ InternalKeySliceTransform internal_prefix_transform;
std::unique_ptr<IndexBuilder> index_builder;
std::string last_key;
@@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
internal_comparator(icomparator),
file(f),
data_block(options, &internal_comparator),
- index_builder(
- CreateIndexBuilder(index_block_type, &internal_comparator)),
+ internal_prefix_transform(options.prefix_extractor.get()),
+ index_builder(CreateIndexBuilder(index_block_type, &internal_comparator,
+ &this->internal_prefix_transform)),
compression_type(compression_type),
checksum_type(checksum_type),
filter_block(opt.filter_policy == nullptr
@@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
}
};
-// TODO(sdong): Currently only write out binary search index. In
-// BlockBasedTableReader, Hash index will be built using binary search index.
BlockBasedTableBuilder::BlockBasedTableBuilder(
const Options& options, const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator, WritableFile* file,
CompressionType compression_type)
: rep_(new Rep(options, internal_comparator, file,
table_options.flush_block_policy_factory.get(),
- compression_type,
- BlockBasedTableOptions::IndexType::kBinarySearch,
+ compression_type, table_options.index_type,
table_options.checksum)) {
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
@@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if (r->props.num_entries > 0) {
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
}
-
+ r->index_builder->OnKeyAdded(key);
auto should_flush = r->flush_block_policy->Update(key, value);
if (should_flush) {
assert(!r->data_block.empty());
@@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// blocks.
if (ok()) {
- r->index_builder->AddEntry(&r->last_key, &key, r->pending_handle);
+ r->index_builder->AddIndexEntry(&r->last_key, &key, r->pending_handle);
}
}
@@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if (ok() && !empty_data_block) {
- r->index_builder->AddEntry(&r->last_key, nullptr /* no next data block */,
- r->pending_handle);
+ r->index_builder->AddIndexEntry(
+ &r->last_key, nullptr /* no next data block */, r->pending_handle);
+ }
+
+ IndexBuilder::IndexBlocks index_blocks;
+ auto s = r->index_builder->Finish(&index_blocks);
+ if (!s.ok()) {
+ return s;
}
// Write meta blocks and metaindex block with the following order.
// 1. [meta block: filter]
- // 2. [meta block: properties]
- // 3. [metaindex block]
- if (ok()) {
- MetaIndexBuilder meta_index_builer;
+ // 2. [other meta blocks]
+ // 3. [meta block: properties]
+ // 4. [metaindex block]
+ // write meta blocks
+ MetaIndexBuilder meta_index_builder;
+ for (const auto& item : index_blocks.meta_blocks) {
+ BlockHandle block_handle;
+ WriteBlock(item.second, &block_handle);
+ meta_index_builder.Add(item.first, block_handle);
+ }
- // Write filter block.
+ if (ok()) {
if (r->filter_block != nullptr) {
// Add mapping from "<filter_block_prefix>.Name" to location
// of filter data.
std::string key = BlockBasedTable::kFilterBlockPrefix;
key.append(r->options.filter_policy->Name());
- meta_index_builer.Add(key, filter_block_handle);
+ meta_index_builder.Add(key, filter_block_handle);
}
// Write properties block.
@@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
&properties_block_handle
);
- meta_index_builer.Add(kPropertiesBlock,
- properties_block_handle);
+ meta_index_builder.Add(kPropertiesBlock, properties_block_handle);
} // end of properties block writing
-
- WriteRawBlock(
- meta_index_builer.Finish(),
- kNoCompression,
- &metaindex_block_handle
- );
- } // meta blocks and metaindex block.
+ } // meta blocks
// Write index block
if (ok()) {
- WriteBlock(r->index_builder->Finish(), &index_block_handle);
+ // flush the meta index block
+ WriteRawBlock(meta_index_builder.Finish(), kNoCompression,
+ &metaindex_block_handle);
+ WriteBlock(index_blocks.index_block_contents, &index_block_handle);
}
// Write footer
@@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
return rep_->offset;
}
-const std::string BlockBasedTable::kFilterBlockPrefix =
- "filter.";
+const std::string BlockBasedTable::kFilterBlockPrefix = "filter.";
} // namespace rocksdb
@@ -56,5 +56,8 @@ TableFactory* NewBlockBasedTableFactory(
const std::string BlockBasedTablePropertyNames::kIndexType =
"rocksdb.block.based.table.index.type";
+const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
+const std::string kHashIndexPrefixesMetadataBlock =
+ "rocksdb.hashindex.metadata";
} // namespace rocksdb
Oops, something went wrong.

0 comments on commit 0b3d03d

Please sign in to comment.