From 4a92ac5ce2bd888dd3a1414bb632e635af058a47 Mon Sep 17 00:00:00 2001
From: Neil Huang
Date: Mon, 28 Jun 2021 16:30:15 -0700
Subject: [PATCH] MB-47156 - make health check interval configurable

Change-Id: I47fc1c2d55badeaa8687999a1bb4da6bcccaf92f
Reviewed-on: http://review.couchbase.org/c/goxdcr/+/156636
Tested-by: Neil Huang
Reviewed-by: Neil Huang
---
 base/constant.go                            |  8 +++++++-
 metadata/internal_settings.go               |  7 +++++++
 pipeline_svc/pipeline_supervisor.go         | 10 ++++------
 replication_manager/replication_manager.go  |  2 ++
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/base/constant.go b/base/constant.go
index acd2ea2f..6cde239c 100644
--- a/base/constant.go
+++ b/base/constant.go
@@ -873,6 +873,9 @@ var RemoteClusterAlternateAddrChangeCnt = 5
 
 var ResourceMgrKVDetectionRetryInterval = 60 * time.Second
 
+var HealthCheckInterval = 120 * time.Second
+var HealthCheckTimeout = 10 * time.Second
+
 func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeCountBeforeRestart,
 	maxTopologyStableCountBeforeRestart, maxWorkersForCheckpointing int,
 	timeoutCheckpointBeforeStop time.Duration, capiDataChanSizeMultiplier int,
@@ -916,7 +919,8 @@ func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeC
 	thresholdRatioForProcessCpu int, thresholdRatioForTotalCpu int,
 	maxCountCpuNotMaxed int, maxCountThroughputDrop int,
 	filteringInternalKey string, filteringInternalXattr string,
-	remoteClusterAlternateAddrChangeCnt int, resourceMgrKVDetectionRetryInterval time.Duration) {
+	remoteClusterAlternateAddrChangeCnt int, resourceMgrKVDetectionRetryInterval time.Duration,
+	healthCheckInterval time.Duration, healthCheckTimeout time.Duration) {
 	TopologyChangeCheckInterval = topologyChangeCheckInterval
 	MaxTopologyChangeCountBeforeRestart = maxTopologyChangeCountBeforeRestart
 	MaxTopologyStableCountBeforeRestart = maxTopologyStableCountBeforeRestart
@@ -1024,6 +1028,8 @@ func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeC
 	}
 	RemoteClusterAlternateAddrChangeCnt = remoteClusterAlternateAddrChangeCnt
 	ResourceMgrKVDetectionRetryInterval = resourceMgrKVDetectionRetryInterval
+	HealthCheckInterval = healthCheckInterval
+	HealthCheckTimeout = healthCheckTimeout
 }
 
 // Need to escape the () to result in "META().xattrs" literal
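The hunks above follow the pattern goxdcr uses for tunables: a package-level variable carries a compiled-in default, and InitConstants overwrites it at startup with the configured value. A minimal, runnable sketch of that override pattern, using illustrative names rather than the real goxdcr declarations:

package main

import (
	"fmt"
	"time"
)

// Compiled-in defaults; these hold until InitConstants runs.
var HealthCheckInterval = 120 * time.Second
var HealthCheckTimeout = 10 * time.Second

// InitConstants overwrites the defaults with values read from settings.
func InitConstants(healthCheckInterval, healthCheckTimeout time.Duration) {
	HealthCheckInterval = healthCheckInterval
	HealthCheckTimeout = healthCheckTimeout
}

func main() {
	fmt.Println(HealthCheckInterval, HealthCheckTimeout) // 2m0s 10s
	InitConstants(60*time.Second, 5*time.Second)
	fmt.Println(HealthCheckInterval, HealthCheckTimeout) // 1m0s 5s
}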
diff --git a/metadata/internal_settings.go b/metadata/internal_settings.go
index dfada378..1b21ee95 100644
--- a/metadata/internal_settings.go
+++ b/metadata/internal_settings.go
@@ -152,6 +152,9 @@ const (
 	BypassSanInCertificateCheckKey = "BypassSanInCertificateCheck"
 	// Number of times to verify bucket is missing before removing an invalid replicationSpec
 	ReplicationSpecGCCntKey = "ReplicationSpecGCCnt"
+	// Pipeline Supervisor health check interval
+	HealthCheckIntervalKey = "HealthCheckIntervalSec"
+	HealthCheckTimeoutKey  = "HealthCheckTimeoutSec"
 
 	TimeoutRuntimeContextStartKey = "TimeoutRuntimeContextStart"
 	TimeoutRuntimeContextStopKey  = "TimeoutRuntimeContextStop"
@@ -302,6 +305,8 @@ var FilteringInternalKeyConfig = &SettingsConfig{base.InternalKeyKey, nil}
 var FilteringInternalXattrConfig = &SettingsConfig{base.InternalKeyXattr, nil}
 var RemoteClusterAlternateAddrChangeConfig = &SettingsConfig{base.RemoteClusterAlternateAddrChangeCnt, &Range{1, 1000}}
 var ResourceMgrKVDetectionRetryIntervalConfig = &SettingsConfig{int(base.ResourceMgrKVDetectionRetryInterval / time.Second), &Range{1, 3600}}
+var HealthCheckIntervalConfig = &SettingsConfig{int(base.HealthCheckInterval / time.Second), &Range{5, 3600 /*1 hour*/}}
+var HealthCheckTimeoutConfig = &SettingsConfig{int(base.HealthCheckTimeout / time.Second), &Range{5, 3600 /*1 hour*/}}
 
 var XDCRInternalSettingsConfigMap = map[string]*SettingsConfig{
 	TopologyChangeCheckIntervalKey: TopologyChangeCheckIntervalConfig,
@@ -392,6 +397,8 @@ var XDCRInternalSettingsConfigMap = map[string]*SettingsConfig{
 	FilteringInternalXattr:                 FilteringInternalXattrConfig,
 	RemoteClusterAlternateAddrChangeKey:    RemoteClusterAlternateAddrChangeConfig,
 	ResourceMgrKVDetectionRetryIntervalKey: ResourceMgrKVDetectionRetryIntervalConfig,
+	HealthCheckIntervalKey:                 HealthCheckIntervalConfig,
+	HealthCheckTimeoutKey:                  HealthCheckTimeoutConfig,
 }
 
 func InitConstants(xmemMaxIdleCountLowerBound int, xmemMaxIdleCountUpperBound int) {
diff --git a/pipeline_svc/pipeline_supervisor.go b/pipeline_svc/pipeline_supervisor.go
index 02d9b8a3..72834bbe 100644
--- a/pipeline_svc/pipeline_supervisor.go
+++ b/pipeline_svc/pipeline_supervisor.go
@@ -37,7 +37,6 @@ const (
 )
 
 const (
-	health_check_interval          = 120 * time.Second
 	default_max_dcp_miss_count     = 10
 	filterErrCheckAndPrintInterval = 5 * time.Second
 	maxFilterErrorsPerInterval     = 20
@@ -141,7 +140,7 @@ func (pipelineSupervisor *PipelineSupervisor) setMaxDcpMissCount(settings metada
 	// before we declare the pipeline to be broken
 	var max_dcp_miss_count int
 	stats_update_interval := StatsUpdateInterval(settings)
-	number_of_waits_to_ensure_stats_update := int(stats_update_interval.Nanoseconds()/health_check_interval.Nanoseconds()) + 1
+	number_of_waits_to_ensure_stats_update := int(stats_update_interval.Nanoseconds()/base.HealthCheckInterval.Nanoseconds()) + 1
 	if number_of_waits_to_ensure_stats_update < default_max_dcp_miss_count {
 		max_dcp_miss_count = default_max_dcp_miss_count
 	} else {
@@ -181,11 +180,11 @@ func (pipelineSupervisor *PipelineSupervisor) closeConnections() {
 }
 
 func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
-	pipelineSupervisor.Logger().Infof("%v monitorPipelineHealth started", pipelineSupervisor.Id())
+	pipelineSupervisor.Logger().Infof("%v monitorPipelineHealth started with interval: %v timeout: %v", pipelineSupervisor.Id(), base.HealthCheckInterval, base.HealthCheckTimeout)
 
 	defer pipelineSupervisor.GenericSupervisor.ChidrenWaitGroup().Done()
 
-	health_check_ticker := time.NewTicker(health_check_interval)
+	health_check_ticker := time.NewTicker(base.HealthCheckInterval)
 	defer health_check_ticker.Stop()
 
 	fin_ch := pipelineSupervisor.GenericSupervisor.FinishChannel()
@@ -196,7 +195,7 @@ func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
 			pipelineSupervisor.Logger().Infof("monitorPipelineHealth routine is exiting because parent supervisor %v has been stopped\n", pipelineSupervisor.Id())
 			return nil
 		case <-health_check_ticker.C:
-			err := base.ExecWithTimeout(pipelineSupervisor.checkPipelineHealth, 1000*time.Millisecond, pipelineSupervisor.Logger())
+			err := base.ExecWithTimeout(pipelineSupervisor.checkPipelineHealth, base.HealthCheckTimeout, pipelineSupervisor.Logger())
 			if err != nil {
 				if err == base.ErrorExecutionTimedOut {
 					// ignore timeout error and continue
@@ -207,7 +206,6 @@ func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
 			}
 		}
 	}
-	return nil
 }
 
 func (pipelineSupervisor *PipelineSupervisor) logFilterError(err error, desc string) {
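With this change, monitorPipelineHealth takes both its tick interval and its per-check timeout from base instead of local constants. A self-contained sketch of the loop's shape, where execWithTimeout is a simplified stand-in for base.ExecWithTimeout (assumed here to run the check in a goroutine and report a timeout error, without cancelling the check, when it overruns); it also shows why the trailing return nil deleted above was unreachable, since the for/select loop only exits through the returns inside it:

package main

import (
	"errors"
	"fmt"
	"time"
)

var errExecutionTimedOut = errors.New("execution timed out")

// execWithTimeout runs f and waits at most timeout for it to finish.
// On timeout the goroutine running f is abandoned, not killed; the
// buffered channel lets it complete its send without leaking forever.
func execWithTimeout(f func() error, timeout time.Duration) error {
	done := make(chan error, 1)
	go func() { done <- f() }()
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		return errExecutionTimedOut
	}
}

// monitorHealth mirrors the supervision loop: tick every interval,
// bound each check by timeout, tolerate slow checks, and exit only
// when the parent's finish channel closes or a real error surfaces.
func monitorHealth(interval, timeout time.Duration, check func() error, finCh <-chan struct{}) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-finCh:
			return nil // parent supervisor stopped; the loop exits here, so no trailing return is needed
		case <-ticker.C:
			if err := execWithTimeout(check, timeout); err != nil {
				if errors.Is(err, errExecutionTimedOut) {
					continue // a slow check is tolerated; try again next tick
				}
				return err
			}
		}
	}
}

func main() {
	finCh := make(chan struct{})
	go func() { time.Sleep(250 * time.Millisecond); close(finCh) }()
	err := monitorHealth(50*time.Millisecond, 20*time.Millisecond, func() error {
		fmt.Println("health check ran")
		return nil
	}, finCh)
	fmt.Println("monitor exited:", err)
}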
diff --git a/replication_manager/replication_manager.go b/replication_manager/replication_manager.go
index 4b6b5961..ce45e655 100644
--- a/replication_manager/replication_manager.go
+++ b/replication_manager/replication_manager.go
@@ -290,6 +290,8 @@ func initConstants(xdcr_topology_svc service_def.XDCRCompTopologySvc, internal_s
 		internal_settings.Values[metadata.FilteringInternalXattr].(string),
 		internal_settings.Values[metadata.RemoteClusterAlternateAddrChangeKey].(int),
 		time.Duration(internal_settings.Values[metadata.ResourceMgrKVDetectionRetryIntervalKey].(int))*time.Second,
+		time.Duration(internal_settings.Values[metadata.HealthCheckIntervalKey].(int))*time.Second,
+		time.Duration(internal_settings.Values[metadata.HealthCheckTimeoutKey].(int))*time.Second,
 	)
 }
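End to end, the new settings are persisted as integer seconds (hence the "Sec" suffix on the keys), range-checked against the &Range{5, 3600} bounds declared in internal_settings.go, and converted to time.Duration at this call site before InitConstants publishes them. A sketch of that flow, using simplified stand-ins for metadata.SettingsConfig and Range; the field names here are assumptions for illustration, not the real goxdcr types:

package main

import (
	"fmt"
	"time"
)

// Range bounds an integer setting (illustrative field names).
type Range struct{ MinValue, MaxValue int }

// SettingsConfig pairs a default (in seconds) with optional bounds.
type SettingsConfig struct {
	DefaultValue int
	*Range
}

// validate mirrors the bounds check implied by &Range{5, 3600}.
func (c *SettingsConfig) validate(v int) error {
	if c.Range != nil && (v < c.MinValue || v > c.MaxValue) {
		return fmt.Errorf("value %d out of range [%d, %d]", v, c.MinValue, c.MaxValue)
	}
	return nil
}

func main() {
	healthCheckIntervalConfig := &SettingsConfig{120, &Range{5, 3600 /*1 hour*/}}
	if err := healthCheckIntervalConfig.validate(60); err != nil {
		panic(err)
	}
	// Stored as seconds, converted to a Duration at the initConstants call site.
	interval := time.Duration(60) * time.Second
	fmt.Println("health check interval:", interval) // 1m0s
}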