Skip to content

Commit

Permalink
feat: change default value of startDelay to 3600 (#2847)
Browse files Browse the repository at this point in the history
IMPORTANT: The default value of `startDelay` has now been
changed to 3600 seconds, equivalent to 1 hour.

Previously, it was set to 30 seconds.

This patch also replaces the livenessProbe's initial delay with a
proper Kubernetes startup probe to handle the startup of
a Postgres server. Both the startup probe and the higher default
value should improve the self-healing capabilities of larger Postgres
deployments, where the previous default of 30 seconds was not
sufficient, for example, to complete a crash recovery operation,
causing infinite restarts.

Closes #2843 

Signed-off-by: Tao Li <tao.li@enterprisedb.com>
Signed-off-by: YanniHu1996 <yantian.hu@enterprisedb.com>
Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Signed-off-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
Co-authored-by: YanniHu1996 <yantian.hu@enterprisedb.com>
Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Co-authored-by: Gabriele Bartolini <gabriele.bartolini@enterprisedb.com>
  • Loading branch information
4 people committed Sep 22, 2023
1 parent 9565f9f commit 4f4cd96
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 14 deletions.
13 changes: 10 additions & 3 deletions api/v1/cluster_types.go
Expand Up @@ -265,8 +265,10 @@ type ClusterSpec struct {
WalStorage *StorageConfiguration `json:"walStorage,omitempty"`

// The time in seconds that is allowed for a PostgreSQL instance to
// successfully start up (default 30)
// +kubebuilder:default:=30
// successfully start up (default 3600).
// The startup probe failure threshold is derived from this value using the formula:
// ceiling(startDelay / 10).
// +kubebuilder:default:=3600
// +optional
MaxStartDelay int32 `json:"startDelay,omitempty"`

Expand Down Expand Up @@ -981,6 +983,11 @@ const (
// is gracefully shutdown during a switchover.
// It is one hour expressed in seconds.
DefaultMaxSwitchoverDelay = 3600

// DefaultStartupDelay is the default value for startDelay. The startDelay value is used to calculate
// the FailureThreshold of the startup probe with the formula
// `FailureThreshold = ceiling(startDelay / periodSeconds)`; the minimum value is 1.
DefaultStartupDelay = 3600
)

// PostgresConfiguration defines the PostgreSQL configuration
Expand Down Expand Up @@ -2381,7 +2388,7 @@ func (cluster *Cluster) GetMaxStartDelay() int32 {
if cluster.Spec.MaxStartDelay > 0 {
return cluster.Spec.MaxStartDelay
}
return 30
return DefaultStartupDelay
}

// GetMaxStopDelay get the amount of time PostgreSQL has to stop
Expand Down
8 changes: 5 additions & 3 deletions config/crd/bases/postgresql.cnpg.io_clusters.yaml
Expand Up @@ -3010,9 +3010,11 @@ spec:
- metadata
type: object
startDelay:
default: 30
description: The time in seconds that is allowed for a PostgreSQL
instance to successfully start up (default 30)
default: 3600
description: 'The time in seconds that is allowed for a PostgreSQL
instance to successfully start up (default 3600). The startup probe
failure threshold is derived from this value using the formula:
ceiling(startDelay / 10).'
format: int32
type: integer
stopDelay:
Expand Down
4 changes: 3 additions & 1 deletion docs/src/cloudnative-pg.v1.md
Expand Up @@ -1451,7 +1451,9 @@ user by setting it to <code>NULL</code>. Enabled by default.</p>
</td>
<td>
<p>The time in seconds that is allowed for a PostgreSQL instance to
successfully start up (default 30)</p>
successfully start up (default 3600).
The startup probe failure threshold is derived from this value using the formula:
ceiling(startDelay / 10).</p>
</td>
</tr>
<tr><td><code>stopDelay</code><br/>
Expand Down
36 changes: 29 additions & 7 deletions pkg/specs/pods.go
Expand Up @@ -21,6 +21,7 @@ package specs
import (
"encoding/json"
"fmt"
"math"
"reflect"
"strconv"

Expand Down Expand Up @@ -81,6 +82,12 @@ const (

// ReadinessProbePeriod is the period set for the postgres instance readiness probe
ReadinessProbePeriod = 10

// StartupProbePeriod is the period set for the postgres instance startup probe
StartupProbePeriod = 10

// LivenessProbePeriod is the period set for the postgres instance liveness probe
LivenessProbePeriod = 10
)

// EnvConfig carries the environment configuration of a container
Expand Down Expand Up @@ -184,6 +191,17 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
Env: envConfig.EnvVars,
EnvFrom: envConfig.EnvFrom,
VolumeMounts: createPostgresVolumeMounts(cluster),
StartupProbe: &corev1.Probe{
FailureThreshold: getStartupProbeFailureThreshold(cluster.GetMaxStartDelay()),
PeriodSeconds: StartupProbePeriod,
TimeoutSeconds: 5,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: url.PathHealth,
Port: intstr.FromInt32(int32(url.StatusPort)),
},
},
},
ReadinessProbe: &corev1.Probe{
TimeoutSeconds: 5,
PeriodSeconds: ReadinessProbePeriod,
Expand All @@ -194,14 +212,9 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
},
},
},
// From K8s 1.17 and newer, startup probes will be available for
// all users and not just protected from feature gates. For now
// let's use the LivenessProbe. When we will drop support for K8s
// 1.16, we'll configure a StartupProbe and this will lead to a
// better LivenessProbe (without InitialDelaySeconds).
LivenessProbe: &corev1.Probe{
InitialDelaySeconds: cluster.GetMaxStartDelay(),
TimeoutSeconds: 5,
PeriodSeconds: LivenessProbePeriod,
TimeoutSeconds: 5,
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Path: url.PathHealth,
Expand Down Expand Up @@ -241,6 +254,15 @@ func createPostgresContainers(cluster apiv1.Cluster, envConfig EnvConfig) []core
return containers
}

// getStartupProbeFailureThreshold computes the failure threshold of the
// startup probe from the given start delay, expressed in seconds.
// The result is ceil(startupDelay / StartupProbePeriod) and is never lower
// than 1.
func getStartupProbeFailureThreshold(startupDelay int32) int32 {
	// The delay spans more than one probe period: round the number of
	// periods up so that the whole delay is covered.
	if startupDelay > StartupProbePeriod {
		periods := float64(startupDelay) / float64(StartupProbePeriod)
		return int32(math.Ceil(periods))
	}

	// Any delay that fits within a single period yields the minimum.
	return 1
}

// CreateAffinitySection creates the affinity sections for Pods, given the configuration
// from the user
func CreateAffinitySection(clusterName string, config apiv1.AffinityConfiguration) *corev1.Affinity {
Expand Down
10 changes: 10 additions & 0 deletions pkg/specs/pods_test.go
Expand Up @@ -829,3 +829,13 @@ var _ = Describe("PodSpec drift detection", func() {
Expect(specsMatch).To(BeFalse())
})
})

// Specs covering getStartupProbeFailureThreshold: the lower bound of 1 and
// the rounding up of a delay that is not a multiple of the probe period.
var _ = Describe("Compute startup probe failure threshold", func() {
	It("should take the minimum value 1", func() {
		// A delay of 5 does not exceed the probe period, so the minimum applies
		threshold := getStartupProbeFailureThreshold(5)
		Expect(threshold).To(BeNumerically("==", 1))
	})

	It("should take the value from 'startDelay / periodSeconds'", func() {
		// 109 is not a multiple of the period: the quotient is rounded up to 11
		threshold := getStartupProbeFailureThreshold(109)
		Expect(threshold).To(BeNumerically("==", 11))
	})
})

0 comments on commit 4f4cd96

Please sign in to comment.