Skip to content

Commit

Permalink
fix: make SlaveDB stay in WaitDBSync state instead of sink into Error…
Browse files Browse the repository at this point in the history
… State if rsync init failed (OpenAtomFoundation#2667)

* Make the Pika slave DB stay in the WaitDBSync state if rsync init fails (pulling meta from the master times out), so that the slave DB keeps retrying instead of sinking into the Error state

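* Add a maximum retry count (kMaxRsyncInitReTryTimes) for rsync init; the slave DB enters the Error state only after that many consecutive failures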

---------

Co-authored-by: cjh <1271435567@qq.com>
  • Loading branch information
cheniujh and cheniujh committed Jun 3, 2024
1 parent 7ea90b7 commit 44112d4
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/pika_define.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ const std::string kDefaultRsyncAuth = "default";

/* Rsync */
const int kMaxRsyncParallelNum = 4;
constexpr int kMaxRsyncInitReTryTimes = 64;

struct DBStruct {
DBStruct(std::string tn, int32_t inst_num)
Expand Down
1 change: 1 addition & 0 deletions include/pika_rm.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class SyncSlaveDB : public SyncDB {

private:
std::unique_ptr<rsync::RsyncClient> rsync_cli_;
int32_t rsync_init_retry_count_{0};
pstd::Mutex db_mu_;
RmNode m_info_;
ReplState repl_state_{kNoConnect};
Expand Down
15 changes: 10 additions & 5 deletions src/pika_rm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -507,13 +507,19 @@ pstd::Status SyncSlaveDB::ActivateRsync() {
if (!rsync_cli_->IsIdle()) {
return s;
}
LOG(WARNING) << "ActivateRsync ...";
LOG(WARNING) << "Slave DB: " << DBName() << " Activating Rsync ... (retry count:" << rsync_init_retry_count_ << ")";
if (rsync_cli_->Init()) {
rsync_init_retry_count_ = 0;
rsync_cli_->Start();
return s;
} else {
SetReplState(ReplState::kError);
return Status::Error("rsync client init failed!");;
rsync_init_retry_count_ += 1;
if (rsync_init_retry_count_ >= kMaxRsyncInitReTryTimes) {
SetReplState(ReplState::kError);
LOG(ERROR) << "Full Sync Stage - Rsync Init failed: Slave failed to pull meta info(generated by bgsave task in Master) from Master after MaxRsyncInitReTryTimes("
<< kMaxRsyncInitReTryTimes << " times) is reached. This usually means the Master's bgsave task has costed an unexpected-long time.";
}
return Status::Error("rsync client init failed!");
}
}

Expand Down Expand Up @@ -977,8 +983,7 @@ Status PikaReplicaManager::RunSyncSlaveDBStateMachine() {
} else if (s_db->State() == ReplState::kWaitDBSync) {
Status s = s_db->ActivateRsync();
if (!s.ok()) {
g_pika_server->SetForceFullSync(true);
LOG(WARNING) << "Slave DB: " << s_db->DBName() << " rsync failed! full synchronization will be retried later, error info:" << s.ToString();
LOG(WARNING) << "Slave DB: " << s_db->DBName() << " rsync failed! full synchronization will be retried later";
continue;
}

Expand Down
2 changes: 1 addition & 1 deletion tests/integration/rsync_dynamic_reconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ var _ = Describe("Rsync Reconfig Test", func() {
slave1.FlushDB(ctx)
master1.FlushDB(ctx)
time.Sleep(3 * time.Second)
RefillMaster(MASTERADDR, 128, ctx)
RefillMaster(MASTERADDR, 64, ctx)
key1 := "45vs45f4s5d6"
value1 := "afd54g5s4f545"
//set key before sync happened, slave is supposed to fetch it when sync done
Expand Down

0 comments on commit 44112d4

Please sign in to comment.