Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GetLiveFiles() returning OPTIONS-000000 #8268

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Fixed a bug where ingested files were written with incorrect boundary key metadata. In rare cases this could have led to a level's files being wrongly ordered and queries for the boundary keys returning wrong results.
* Fixed a data race between insertion into memtables and the retrieval of the DB properties `rocksdb.cur-size-active-mem-table`, `rocksdb.cur-size-all-mem-tables`, and `rocksdb.size-all-mem-tables`.
* Fixed the false-positive alert when recovering from the WAL file. Avoid reporting "SST file is ahead of WAL" on a newly created empty column family, if the previous WAL file is corrupted.
* Fixed a bug where `GetLiveFiles()` output included a non-existent file called "OPTIONS-000000". Backups and checkpoints, which use `GetLiveFiles()`, failed on DBs impacted by this bug. Read-write DBs were impacted when the latest OPTIONS file failed to write and `fail_if_options_file_error == false`. Read-only DBs were impacted when no OPTIONS files existed.

### Behavior Changes
* Due to the fix of false-postive alert of "SST file is ahead of WAL", all the CFs with no SST file (CF empty) will bypass the consistency check. We fixed a false-positive, but introduced a very rare true-negative which will be triggered in the following conditions: A CF with some delete operations in the last a few queries which will result in an empty CF (those are flushed to SST file and a compaction triggered which combines this file and all other SST files and generates an empty CF, or there is another reason to write a manifest entry for this CF after a flush that generates no SST file from an empty CF). The deletion entries are logged in a WAL and this WAL was corrupted, while the CF's log number points to the next WAL (due to the flush). Therefore, the DB can only recover to the point without these trailing deletions and cause the inconsistent DB status.
Expand Down
9 changes: 8 additions & 1 deletion db/db_filesnapshot.cc
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,14 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,

ret.emplace_back(CurrentFileName(""));
ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
// The OPTIONS file number is zero in read-write mode when OPTIONS file
// writing failed and the DB was configured with
// `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
// number is zero when no OPTIONS file exist at all. In those cases we do not
// record any OPTIONS file in the live file list.
if (versions_->options_file_number() != 0) {
ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
}

// find length of manifest file while holding the mutex lock
*manifest_file_size = versions_->manifest_file_size();
Expand Down
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ DECLARE_bool(best_efforts_recovery);
DECLARE_bool(skip_verifydb);
DECLARE_bool(enable_compaction_filter);
DECLARE_bool(paranoid_file_checks);
DECLARE_bool(fail_if_options_file_error);
DECLARE_uint64(batch_protection_bytes_per_key);

DECLARE_uint64(user_timestamp_size);
Expand Down
4 changes: 4 additions & 0 deletions db_stress_tool/db_stress_gflags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,10 @@ DEFINE_bool(paranoid_file_checks, true,
"After writing every SST file, reopen it and read all the keys "
"and validate checksums");

DEFINE_bool(fail_if_options_file_error, false,
"Fail operations that fail to detect or properly persist options "
"file.");

DEFINE_uint64(batch_protection_bytes_per_key, 0,
"If nonzero, enables integrity protection in `WriteBatch` at the "
"specified number of bytes per key. Currently the only supported "
Expand Down
3 changes: 3 additions & 0 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2110,6 +2110,8 @@ void StressTest::PrintEnv() const {
fprintf(stdout, "Sync fault injection : %d\n", FLAGS_sync_fault_injection);
fprintf(stdout, "Best efforts recovery : %d\n",
static_cast<int>(FLAGS_best_efforts_recovery));
fprintf(stdout, "Fail if OPTIONS file error: %d\n",
static_cast<int>(FLAGS_fail_if_options_file_error));
fprintf(stdout, "User timestamp size bytes : %d\n",
static_cast<int>(FLAGS_user_timestamp_size));

Expand Down Expand Up @@ -2328,6 +2330,7 @@ void StressTest::Open() {

options_.best_efforts_recovery = FLAGS_best_efforts_recovery;
options_.paranoid_file_checks = FLAGS_paranoid_file_checks;
options_.fail_if_options_file_error = FLAGS_fail_if_options_file_error;

if ((options_.enable_blob_files || options_.enable_blob_garbage_collection ||
FLAGS_allow_setting_blob_options_dynamically) &&
Expand Down
1 change: 1 addition & 0 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"enable_pipelined_write": lambda: random.randint(0, 1),
"enable_compaction_filter": lambda: random.choice([0, 0, 0, 1]),
"expected_values_path": lambda: setup_expected_values_file(),
"fail_if_options_file_error": lambda: random.randint(0, 1),
"flush_one_in": 1000000,
"file_checksum_impl": lambda: random.choice(["none", "crc32c", "xxh64", "big"]),
"get_live_files_one_in": 1000000,
Expand Down
45 changes: 45 additions & 0 deletions utilities/checkpoint/checkpoint_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "utilities/fault_injection_env.h"
#include "utilities/fault_injection_fs.h"

namespace ROCKSDB_NAMESPACE {
class CheckpointTest : public testing::Test {
Expand Down Expand Up @@ -793,6 +794,50 @@ TEST_F(CheckpointTest, CheckpointWithUnsyncedDataDropped) {
db_ = nullptr;
}

TEST_F(CheckpointTest, CheckpointOptionsFileFailedToPersist) {
// Regression test for a bug where checkpoint failed on a DB where persisting
// OPTIONS file failed and the DB was opened with
// `fail_if_options_file_error == false`.
Options options = CurrentOptions();
options.fail_if_options_file_error = false;
auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());

// Setup `FaultInjectionTestFS` and `SyncPoint` callbacks to fail one
// operation when inside the OPTIONS file persisting code.
std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
fault_fs->SetRandomMetadataWriteError(1 /* one_in */);
SyncPoint::GetInstance()->SetCallBack(
"PersistRocksDBOptions:start", [fault_fs](void* /* arg */) {
fault_fs->EnableMetadataWriteErrorInjection();
});
SyncPoint::GetInstance()->SetCallBack(
"FaultInjectionTestFS::InjectMetadataWriteError:Injected",
[fault_fs](void* /* arg */) {
fault_fs->DisableMetadataWriteErrorInjection();
});
options.env = fault_fs_env.get();
SyncPoint::GetInstance()->EnableProcessing();

Reopen(options);
ASSERT_OK(Put("key1", "val1"));
Checkpoint* checkpoint;
ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_));
delete checkpoint;

// Make sure it's usable.
options.env = env_;
DB* snapshot_db;
ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db));
ReadOptions read_opts;
std::string get_result;
ASSERT_OK(snapshot_db->Get(read_opts, "key1", &get_result));
ASSERT_EQ("val1", get_result);
delete snapshot_db;
delete db_;
db_ = nullptr;
}

TEST_F(CheckpointTest, CheckpointReadOnlyDB) {
ASSERT_OK(Put("foo", "foo_value"));
ASSERT_OK(Flush());
Expand Down
14 changes: 9 additions & 5 deletions utilities/fault_injection_fs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "env/composite_env_wrapper.h"
#include "port/lang.h"
#include "port/stack_trace.h"
#include "test_util/sync_point.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/random.h"
Expand Down Expand Up @@ -708,12 +709,15 @@ IOStatus FaultInjectionTestFS::InjectWriteError(const std::string& file_name) {
}

IOStatus FaultInjectionTestFS::InjectMetadataWriteError() {
MutexLock l(&mutex_);
if (!enable_metadata_write_error_injection_ ||
!metadata_write_error_one_in_ ||
!write_error_rand_.OneIn(metadata_write_error_one_in_)) {
return IOStatus::OK();
{
MutexLock l(&mutex_);
if (!enable_metadata_write_error_injection_ ||
!metadata_write_error_one_in_ ||
!write_error_rand_.OneIn(metadata_write_error_one_in_)) {
return IOStatus::OK();
}
}
TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected");
return IOStatus::IOError();
}

Expand Down