feat: add TopologySpreadConstraints section to cluster spec (#2202)
Topology spread constraints went GA in Kubernetes 1.19, so we can start using them.
There was no pressing need for them before, but the improvements that landed in the
latest 1.26 and 1.27 releases make this a good time to support topology spread
constraints in the cluster spec.

Some options may not work on older clusters, since they are only available in
Kubernetes 1.26 or above. For reference, see
https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
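
As an illustration, a Cluster manifest can now declare spread constraints directly
in its spec, for example to keep instances balanced across availability zones. This
is a minimal sketch; the cnpg.io/cluster label selector is an assumption about how
the instance pods are labeled:

  apiVersion: postgresql.cnpg.io/v1
  kind: Cluster
  metadata:
    name: cluster-example
  spec:
    instances: 3
    storage:
      size: 1Gi
    topologySpreadConstraints:
      - maxSkew: 1
        topologyKey: topology.kubernetes.io/zone
        whenUnsatisfiable: DoNotSchedule
        labelSelector:
          matchLabels:
            # assumption: instance pods carry this label
            cnpg.io/cluster: cluster-example

The entries follow the upstream corev1.TopologySpreadConstraint type: maxSkew,
topologyKey, and whenUnsatisfiable are required, while the remaining options depend
on the Kubernetes version of the target cluster.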

Closes #2192

Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Signed-off-by: Jaime Silvela <jaime.silvela@enterprisedb.com>
Co-authored-by: Jaime Silvela <jaime.silvela@enterprisedb.com>
(cherry picked from commit 50601b0)
armru committed Jun 10, 2023
1 parent 79867ee commit 47dfc54
Showing 10 changed files with 330 additions and 46 deletions.
3 changes: 3 additions & 0 deletions .wordlist-en-custom.txt
@@ -347,6 +347,8 @@ TOC
TODO
TimelineId
TopologyKey
TopologySpreadConstraint
TopologySpreadConstraints
TypedLocalObjectReference
UID
Uncomment
@@ -944,6 +946,7 @@ tls
tmp
tmpfs
tolerations
topologies
topologyKey
transactional
transactionid
6 changes: 6 additions & 0 deletions api/v1/cluster_types.go
@@ -243,6 +243,12 @@ type ClusterSpec struct {
// +optional
Affinity AffinityConfiguration `json:"affinity,omitempty"`

// TopologySpreadConstraints specifies how to spread matching pods among the given topology.
// More info:
// https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
// +optional
TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`

// Resources requirements of every generated Pod. Please refer to
// https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
// for more information.
7 changes: 7 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

173 changes: 173 additions & 0 deletions config/crd/bases/postgresql.cnpg.io_clusters.yaml
@@ -3225,6 +3225,179 @@ spec:
an infinite delay
format: int32
type: integer
topologySpreadConstraints:
description: 'TopologySpreadConstraints specifies how to spread matching
pods among the given topology. More info: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/'
items:
description: TopologySpreadConstraint specifies how to spread matching
pods among the given topology.
properties:
labelSelector:
description: LabelSelector is used to find matching pods. Pods
that match this label selector are counted to determine the
number of pods in their corresponding topology domain.
properties:
matchExpressions:
description: matchExpressions is a list of label selector
requirements. The requirements are ANDed.
items:
description: A label selector requirement is a selector
that contains values, a key, and an operator that relates
the key and values.
properties:
key:
description: key is the label key that the selector
applies to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn,
Exists and DoesNotExist.
type: string
values:
description: values is an array of string values.
If the operator is In or NotIn, the values array
must be non-empty. If the operator is Exists or
DoesNotExist, the values array must be empty. This
array is replaced during a strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs.
A single {key,value} in the matchLabels map is equivalent
to an element of matchExpressions, whose key field is
"key", the operator is "In", and the values array contains
only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
matchLabelKeys:
description: "MatchLabelKeys is a set of pod label keys to select
the pods over which spreading will be calculated. The keys
are used to lookup values from the incoming pod labels, those
key-value labels are ANDed with labelSelector to select the
group of existing pods over which spreading will be calculated
for the incoming pod. The same key is forbidden to exist in
both MatchLabelKeys and LabelSelector. MatchLabelKeys cannot
be set when LabelSelector isn't set. Keys that don't exist
in the incoming pod labels will be ignored. A null or empty
list means only match against labelSelector. \n This is a
beta field and requires the MatchLabelKeysInPodTopologySpread
feature gate to be enabled (enabled by default)."
items:
type: string
type: array
x-kubernetes-list-type: atomic
maxSkew:
description: 'MaxSkew describes the degree to which pods may
be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
it is the maximum permitted difference between the number
of matching pods in the target topology and the global minimum.
The global minimum is the minimum number of matching pods
in an eligible domain or zero if the number of eligible domains
is less than MinDomains. For example, in a 3-zone cluster,
MaxSkew is set to 1, and pods with the same labelSelector
spread as 2/2/1: In this case, the global minimum is 1. |
zone1 | zone2 | zone3 | | P P | P P | P | - if MaxSkew
is 1, incoming pod can only be scheduled to zone3 to become
2/2/2; scheduling it onto zone1(zone2) would make the ActualSkew(3-1)
on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming
pod can be scheduled onto any zone. When `whenUnsatisfiable=ScheduleAnyway`,
it is used to give higher precedence to topologies that satisfy
it. It''s a required field. Default value is 1 and 0 is not
allowed.'
format: int32
type: integer
minDomains:
description: "MinDomains indicates a minimum number of eligible
domains. When the number of eligible domains with matching
topology keys is less than minDomains, Pod Topology Spread
treats \"global minimum\" as 0, and then the calculation of
Skew is performed. And when the number of eligible domains
with matching topology keys equals or greater than minDomains,
this value has no effect on scheduling. As a result, when
the number of eligible domains is less than minDomains, scheduler
won't schedule more than maxSkew Pods to those domains. If
value is nil, the constraint behaves as if MinDomains is equal
to 1. Valid values are integers greater than 0. When value
is not nil, WhenUnsatisfiable must be DoNotSchedule. \n For
example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains
is set to 5 and pods with the same labelSelector spread as
2/2/2: | zone1 | zone2 | zone3 | | P P | P P | P P |
The number of domains is less than 5(MinDomains), so \"global
minimum\" is treated as 0. In this situation, new pod with
the same labelSelector cannot be scheduled, because computed
skew will be 3(3 - 0) if new Pod is scheduled to any of the
three zones, it will violate MaxSkew. \n This is a beta field
and requires the MinDomainsInPodTopologySpread feature gate
to be enabled (enabled by default)."
format: int32
type: integer
nodeAffinityPolicy:
description: "NodeAffinityPolicy indicates how we will treat
Pod's nodeAffinity/nodeSelector when calculating pod topology
spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector
are included in the calculations. - Ignore: nodeAffinity/nodeSelector
are ignored. All nodes are included in the calculations. \n
If this value is nil, the behavior is equivalent to the Honor
policy. This is a beta-level feature default enabled by the
NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
nodeTaintsPolicy:
description: "NodeTaintsPolicy indicates how we will treat node
taints when calculating pod topology spread skew. Options
are: - Honor: nodes without taints, along with tainted nodes
for which the incoming pod has a toleration, are included.
- Ignore: node taints are ignored. All nodes are included.
\n If this value is nil, the behavior is equivalent to the
Ignore policy. This is a beta-level feature default enabled
by the NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
topologyKey:
description: TopologyKey is the key of node labels. Nodes that
have a label with this key and identical values are considered
to be in the same topology. We consider each <key, value>
as a "bucket", and try to put balanced number of pods into
each bucket. We define a domain as a particular instance of
a topology. Also, we define an eligible domain as a domain
whose nodes meet the requirements of nodeAffinityPolicy and
nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname",
each Node is a domain of that topology. And, if TopologyKey
is "topology.kubernetes.io/zone", each zone is a domain of
that topology. It's a required field.
type: string
whenUnsatisfiable:
description: 'WhenUnsatisfiable indicates how to deal with a
pod if it doesn''t satisfy the spread constraint. - DoNotSchedule
(default) tells the scheduler not to schedule it. - ScheduleAnyway
tells the scheduler to schedule the pod in any location, but
giving higher precedence to topologies that would help reduce
the skew. A constraint is considered "Unsatisfiable" for an
incoming pod if and only if every possible node assignment
for that pod would violate "MaxSkew" on some topology. For
example, in a 3-zone cluster, MaxSkew is set to 1, and pods
with the same labelSelector spread as 3/1/1: | zone1 | zone2
| zone3 | | P P P | P | P | If WhenUnsatisfiable is
set to DoNotSchedule, incoming pod can only be scheduled to
zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on
zone2(zone3) satisfies MaxSkew(1). In other words, the cluster
can still be imbalanced, but scheduler won''t make it *more*
imbalanced. It''s a required field.'
type: string
required:
- maxSkew
- topologyKey
- whenUnsatisfiable
type: object
type: array
walStorage:
description: Configuration of the storage for PostgreSQL WAL (Write-Ahead
Log)
16 changes: 16 additions & 0 deletions controllers/cluster_upgrade.go
@@ -291,6 +291,10 @@ func IsPodNeedingRollout(status postgres.PostgresqlStatus, cluster *apiv1.Cluste
return restartRequired, false, reason
}

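// Detect changes in the cluster's topology spread constraints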
if restartRequired, reason := isPodNeedingUpdatedTopology(cluster, status.Pod); restartRequired {
return restartRequired, false, reason
}

// Detect changes in the postgres container configuration
for _, container := range status.Pod.Spec.Containers {
// we go to the next array element if it isn't the postgres container
@@ -312,6 +316,18 @@ func IsPodNeedingRollout(status postgres.PostgresqlStatus, cluster *apiv1.Cluste
return needingRestart, true, reason
}

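// isPodNeedingUpdatedTopology returns a boolean indicating if a restart is required and the relative message,
// comparing the TopologySpreadConstraints declared in the cluster spec with the ones set on the pod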
func isPodNeedingUpdatedTopology(cluster *apiv1.Cluster, pod corev1.Pod) (bool, string) {
if reflect.DeepEqual(cluster.Spec.TopologySpreadConstraints, pod.Spec.TopologySpreadConstraints) {
return false, ""
}
reason := fmt.Sprintf(
"Pod '%s' does not have up-to-date TopologySpreadConstraints. It needs to match the cluster's constraints.",
pod.Name,
)

return true, reason
}

// isPodNeedingUpdatedScheduler returns a boolean indicating if a restart is required and the relative message
func isPodNeedingUpdatedScheduler(cluster *apiv1.Cluster, pod corev1.Pod) (bool, string) {
if cluster.Spec.SchedulerName == "" || cluster.Spec.SchedulerName == pod.Spec.SchedulerName {
70 changes: 70 additions & 0 deletions controllers/cluster_upgrade_test.go
@@ -18,6 +18,7 @@ package controllers

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
"github.com/cloudnative-pg/cloudnative-pg/pkg/postgres"
@@ -133,3 +134,72 @@ var _ = Describe("Pod upgrade", func() {
})
})
})

var _ = Describe("Test isPodNeedingUpdatedTopology", func() {
var cluster *apiv1.Cluster
var pod corev1.Pod

BeforeEach(func() {
topology := corev1.TopologySpreadConstraint{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "test-app"},
},
}
cluster = &apiv1.Cluster{
Spec: apiv1.ClusterSpec{
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{topology},
},
}
pod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
},
Spec: corev1.PodSpec{
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{topology},
},
}
})

It("should return false when the cluster and pod have the same TopologySpreadConstraints", func() {
needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeFalse())
Expect(reason).To(BeEmpty())
})

It("should return true when the cluster and pod do not have the same TopologySpreadConstraints", func() {
pod.Spec.TopologySpreadConstraints[0].MaxSkew = 2
needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return true when the LabelSelector maps are different", func() {
pod.Spec.TopologySpreadConstraints[0].LabelSelector = &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "different-app"},
}

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return true when TopologySpreadConstraints is nil in one of the objects", func() {
pod.Spec.TopologySpreadConstraints = nil

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return false if both are nil", func() {
cluster.Spec.TopologySpreadConstraints = nil
pod.Spec.TopologySpreadConstraints = nil

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeFalse())
Expect(reason).To(BeEmpty())
})
})
