From 4a92ac5ce2bd888dd3a1414bb632e635af058a47 Mon Sep 17 00:00:00 2001
From: Neil Huang
Date: Mon, 28 Jun 2021 16:30:15 -0700
Subject: [PATCH] MB-47156 - make health check interval configurable

Change-Id: I47fc1c2d55badeaa8687999a1bb4da6bcccaf92f
Reviewed-on: http://review.couchbase.org/c/goxdcr/+/156636
Tested-by: Neil Huang
Reviewed-by: Neil Huang
---
 base/constant.go                            |  8 +++++++-
 metadata/internal_settings.go               |  7 +++++++
 pipeline_svc/pipeline_supervisor.go         | 10 ++++------
 replication_manager/replication_manager.go  |  2 ++
 4 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/base/constant.go b/base/constant.go
index acd2ea2f..6cde239c 100644
--- a/base/constant.go
+++ b/base/constant.go
@@ -873,6 +873,9 @@ var RemoteClusterAlternateAddrChangeCnt = 5
 
 var ResourceMgrKVDetectionRetryInterval = 60 * time.Second
 
+var HealthCheckInterval = 120 * time.Second
+var HealthCheckTimeout = 10 * time.Second
+
 func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeCountBeforeRestart,
 	maxTopologyStableCountBeforeRestart, maxWorkersForCheckpointing int,
 	timeoutCheckpointBeforeStop time.Duration, capiDataChanSizeMultiplier int,
@@ -916,7 +919,8 @@ func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeC
 	thresholdRatioForProcessCpu int, thresholdRatioForTotalCpu int,
 	maxCountCpuNotMaxed int, maxCountThroughputDrop int,
 	filteringInternalKey string, filteringInternalXattr string,
-	remoteClusterAlternateAddrChangeCnt int, resourceMgrKVDetectionRetryInterval time.Duration) {
+	remoteClusterAlternateAddrChangeCnt int, resourceMgrKVDetectionRetryInterval time.Duration,
+	healthCheckInterval time.Duration, healthCheckTimeout time.Duration) {
 	TopologyChangeCheckInterval = topologyChangeCheckInterval
 	MaxTopologyChangeCountBeforeRestart = maxTopologyChangeCountBeforeRestart
 	MaxTopologyStableCountBeforeRestart = maxTopologyStableCountBeforeRestart
@@ -1024,6 +1028,8 @@ func InitConstants(topologyChangeCheckInterval time.Duration, maxTopologyChangeC
 	}
 	RemoteClusterAlternateAddrChangeCnt = remoteClusterAlternateAddrChangeCnt
 	ResourceMgrKVDetectionRetryInterval = resourceMgrKVDetectionRetryInterval
+	HealthCheckInterval = healthCheckInterval
+	HealthCheckTimeout = healthCheckTimeout
 }
 
 // Need to escape the () to result in "META().xattrs" literal
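The hunks above follow the pattern goxdcr uses for tunables: a package-level variable carries a compiled-in default, and InitConstants overwrites it at startup with the configured value. A minimal, runnable sketch of that override pattern, using illustrative names rather than the real goxdcr declarations:

package main

import (
	"fmt"
	"time"
)

// Compiled-in defaults; these hold until InitConstants runs.
var HealthCheckInterval = 120 * time.Second
var HealthCheckTimeout = 10 * time.Second

// InitConstants overwrites the defaults with values read from settings.
func InitConstants(healthCheckInterval, healthCheckTimeout time.Duration) {
	HealthCheckInterval = healthCheckInterval
	HealthCheckTimeout = healthCheckTimeout
}

func main() {
	fmt.Println(HealthCheckInterval, HealthCheckTimeout) // 2m0s 10s
	InitConstants(60*time.Second, 5*time.Second)
	fmt.Println(HealthCheckInterval, HealthCheckTimeout) // 1m0s 5s
}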
diff --git a/metadata/internal_settings.go b/metadata/internal_settings.go
index dfada378..1b21ee95 100644
--- a/metadata/internal_settings.go
+++ b/metadata/internal_settings.go
@@ -152,6 +152,9 @@ const (
 	BypassSanInCertificateCheckKey = "BypassSanInCertificateCheck"
 	// Number of times to verify bucket is missing before removing an invalid replicationSpec
 	ReplicationSpecGCCntKey = "ReplicationSpecGCCnt"
+	// Pipeline Supervisor health check interval
+	HealthCheckIntervalKey = "HealthCheckIntervalSec"
+	HealthCheckTimeoutKey  = "HealthCheckTimeoutSec"
 
 	TimeoutRuntimeContextStartKey = "TimeoutRuntimeContextStart"
 	TimeoutRuntimeContextStopKey  = "TimeoutRuntimeContextStop"
@@ -302,6 +305,8 @@ var FilteringInternalKeyConfig = &SettingsConfig{base.InternalKeyKey, nil}
 var FilteringInternalXattrConfig = &SettingsConfig{base.InternalKeyXattr, nil}
 var RemoteClusterAlternateAddrChangeConfig = &SettingsConfig{base.RemoteClusterAlternateAddrChangeCnt, &Range{1, 1000}}
 var ResourceMgrKVDetectionRetryIntervalConfig = &SettingsConfig{int(base.ResourceMgrKVDetectionRetryInterval / time.Second), &Range{1, 3600}}
+var HealthCheckIntervalConfig = &SettingsConfig{int(base.HealthCheckInterval / time.Second), &Range{5, 3600 /*1 hour*/}}
+var HealthCheckTimeoutConfig = &SettingsConfig{int(base.HealthCheckTimeout / time.Second), &Range{5, 3600 /*1 hour*/}}
 
 var XDCRInternalSettingsConfigMap = map[string]*SettingsConfig{
 	TopologyChangeCheckIntervalKey: TopologyChangeCheckIntervalConfig,
@@ -392,6 +397,8 @@ var XDCRInternalSettingsConfigMap = map[string]*SettingsConfig{
 	FilteringInternalXattr:                 FilteringInternalXattrConfig,
 	RemoteClusterAlternateAddrChangeKey:    RemoteClusterAlternateAddrChangeConfig,
 	ResourceMgrKVDetectionRetryIntervalKey: ResourceMgrKVDetectionRetryIntervalConfig,
+	HealthCheckIntervalKey:                 HealthCheckIntervalConfig,
+	HealthCheckTimeoutKey:                  HealthCheckTimeoutConfig,
 }
 
 func InitConstants(xmemMaxIdleCountLowerBound int, xmemMaxIdleCountUpperBound int) {
diff --git a/pipeline_svc/pipeline_supervisor.go b/pipeline_svc/pipeline_supervisor.go
index 02d9b8a3..72834bbe 100644
--- a/pipeline_svc/pipeline_supervisor.go
+++ b/pipeline_svc/pipeline_supervisor.go
@@ -37,7 +37,6 @@ const (
 )
 
 const (
-	health_check_interval          = 120 * time.Second
 	default_max_dcp_miss_count     = 10
 	filterErrCheckAndPrintInterval = 5 * time.Second
 	maxFilterErrorsPerInterval     = 20
@@ -141,7 +140,7 @@ func (pipelineSupervisor *PipelineSupervisor) setMaxDcpMissCount(settings metada
 	// before we declare the pipeline to be broken
 	var max_dcp_miss_count int
 	stats_update_interval := StatsUpdateInterval(settings)
-	number_of_waits_to_ensure_stats_update := int(stats_update_interval.Nanoseconds()/health_check_interval.Nanoseconds()) + 1
+	number_of_waits_to_ensure_stats_update := int(stats_update_interval.Nanoseconds()/base.HealthCheckInterval.Nanoseconds()) + 1
 	if number_of_waits_to_ensure_stats_update < default_max_dcp_miss_count {
 		max_dcp_miss_count = default_max_dcp_miss_count
 	} else {
@@ -181,11 +180,11 @@ func (pipelineSupervisor *PipelineSupervisor) closeConnections() {
 }
 
 func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
-	pipelineSupervisor.Logger().Infof("%v monitorPipelineHealth started", pipelineSupervisor.Id())
+	pipelineSupervisor.Logger().Infof("%v monitorPipelineHealth started with interval: %v timeout: %v", pipelineSupervisor.Id(), base.HealthCheckInterval, base.HealthCheckTimeout)
 
 	defer pipelineSupervisor.GenericSupervisor.ChidrenWaitGroup().Done()
 
-	health_check_ticker := time.NewTicker(health_check_interval)
+	health_check_ticker := time.NewTicker(base.HealthCheckInterval)
 	defer health_check_ticker.Stop()
 
 	fin_ch := pipelineSupervisor.GenericSupervisor.FinishChannel()
@@ -196,7 +195,7 @@ func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
 			pipelineSupervisor.Logger().Infof("monitorPipelineHealth routine is exiting because parent supervisor %v has been stopped\n", pipelineSupervisor.Id())
 			return nil
 		case <-health_check_ticker.C:
-			err := base.ExecWithTimeout(pipelineSupervisor.checkPipelineHealth, 1000*time.Millisecond, pipelineSupervisor.Logger())
+			err := base.ExecWithTimeout(pipelineSupervisor.checkPipelineHealth, base.HealthCheckTimeout, pipelineSupervisor.Logger())
 			if err != nil {
 				if err == base.ErrorExecutionTimedOut {
 					// ignore timeout error and continue
@@ -207,7 +206,6 @@ func (pipelineSupervisor *PipelineSupervisor) monitorPipelineHealth() error {
 			}
 		}
 	}
-	return nil
 }
 
 func (pipelineSupervisor *PipelineSupervisor) logFilterError(err error, desc string) {
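With this change, monitorPipelineHealth takes both its tick interval and its per-check timeout from base instead of local constants. A self-contained sketch of the loop's shape, where execWithTimeout is a simplified stand-in for base.ExecWithTimeout (assumed here to run the check in a goroutine and report a timeout error, without cancelling the check, when it overruns); it also shows why the trailing return nil deleted above was unreachable, since the for/select loop only exits through the returns inside it:

package main

import (
	"errors"
	"fmt"
	"time"
)

var errExecutionTimedOut = errors.New("execution timed out")

// execWithTimeout runs f and waits at most timeout for it to finish.
// On timeout the goroutine running f is abandoned, not killed; the
// buffered channel lets it complete its send without leaking forever.
func execWithTimeout(f func() error, timeout time.Duration) error {
	done := make(chan error, 1)
	go func() { done <- f() }()
	select {
	case err := <-done:
		return err
	case <-time.After(timeout):
		return errExecutionTimedOut
	}
}

// monitorHealth mirrors the supervision loop: tick every interval,
// bound each check by timeout, tolerate slow checks, and exit only
// when the parent's finish channel closes or a real error surfaces.
func monitorHealth(interval, timeout time.Duration, check func() error, finCh <-chan struct{}) error {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-finCh:
			return nil // parent supervisor stopped; the loop exits here, so no trailing return is needed
		case <-ticker.C:
			if err := execWithTimeout(check, timeout); err != nil {
				if errors.Is(err, errExecutionTimedOut) {
					continue // a slow check is tolerated; try again next tick
				}
				return err
			}
		}
	}
}

func main() {
	finCh := make(chan struct{})
	go func() { time.Sleep(250 * time.Millisecond); close(finCh) }()
	err := monitorHealth(50*time.Millisecond, 20*time.Millisecond, func() error {
		fmt.Println("health check ran")
		return nil
	}, finCh)
	fmt.Println("monitor exited:", err)
}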
diff --git a/replication_manager/replication_manager.go b/replication_manager/replication_manager.go
index 4b6b5961..ce45e655 100644
--- a/replication_manager/replication_manager.go
+++ b/replication_manager/replication_manager.go
@@ -290,6 +290,8 @@ func initConstants(xdcr_topology_svc service_def.XDCRCompTopologySvc, internal_s
 		internal_settings.Values[metadata.FilteringInternalXattr].(string),
 		internal_settings.Values[metadata.RemoteClusterAlternateAddrChangeKey].(int),
 		time.Duration(internal_settings.Values[metadata.ResourceMgrKVDetectionRetryIntervalKey].(int))*time.Second,
+		time.Duration(internal_settings.Values[metadata.HealthCheckIntervalKey].(int))*time.Second,
+		time.Duration(internal_settings.Values[metadata.HealthCheckTimeoutKey].(int))*time.Second,
 	)
 }
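End to end, the new settings are persisted as integer seconds (hence the "Sec" suffix on the keys), range-checked against the &Range{5, 3600} bounds declared in internal_settings.go, and converted to time.Duration at this call site before InitConstants publishes them. A sketch of that flow, using simplified stand-ins for metadata.SettingsConfig and Range; the field names here are assumptions for illustration, not the real goxdcr types:

package main

import (
	"fmt"
	"time"
)

// Range bounds an integer setting (illustrative field names).
type Range struct{ MinValue, MaxValue int }

// SettingsConfig pairs a default (in seconds) with optional bounds.
type SettingsConfig struct {
	DefaultValue int
	*Range
}

// validate mirrors the bounds check implied by &Range{5, 3600}.
func (c *SettingsConfig) validate(v int) error {
	if c.Range != nil && (v < c.MinValue || v > c.MaxValue) {
		return fmt.Errorf("value %d out of range [%d, %d]", v, c.MinValue, c.MaxValue)
	}
	return nil
}

func main() {
	healthCheckIntervalConfig := &SettingsConfig{120, &Range{5, 3600 /*1 hour*/}}
	if err := healthCheckIntervalConfig.validate(60); err != nil {
		panic(err)
	}
	// Stored as seconds, converted to a Duration at the initConstants call site.
	interval := time.Duration(60) * time.Second
	fmt.Println("health check interval:", interval) // 1m0s
}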