feat: add TopologySpreadConstraints section to cluster spec (#2202)
Topology spread constraints went GA in Kubernetes 1.19, so we can start using them.
There was no pressing need for them before, but the improvements that landed in the
latest 1.26 and 1.27 releases make this a good time to support topology spread
constraints in the cluster spec.

Some options may not work on older clusters, since they are only available in
Kubernetes 1.26 or above. For reference, see
https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
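
As an illustration, a Cluster manifest can now declare spread constraints directly
in its spec, for example to keep instances balanced across availability zones. This
is a minimal sketch; the cnpg.io/cluster label selector is an assumption about how
the instance pods are labeled:

  apiVersion: postgresql.cnpg.io/v1
  kind: Cluster
  metadata:
    name: cluster-example
  spec:
    instances: 3
    storage:
      size: 1Gi
    topologySpreadConstraints:
      - maxSkew: 1
        topologyKey: topology.kubernetes.io/zone
        whenUnsatisfiable: DoNotSchedule
        labelSelector:
          matchLabels:
            # assumption: instance pods carry this label
            cnpg.io/cluster: cluster-example

The entries follow the upstream corev1.TopologySpreadConstraint type: maxSkew,
topologyKey, and whenUnsatisfiable are required, while the remaining options depend
on the Kubernetes version of the target cluster.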

Closes #2192

Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com>
Signed-off-by: Jaime Silvela <jaime.silvela@enterprisedb.com>
Co-authored-by: Jaime Silvela <jaime.silvela@enterprisedb.com>
(cherry picked from commit 50601b0)
armru committed Jun 10, 2023
1 parent 79867ee commit 47dfc54
Showing 10 changed files with 330 additions and 46 deletions.
3 changes: 3 additions & 0 deletions .wordlist-en-custom.txt
@@ -347,6 +347,8 @@ TOC
TODO
TimelineId
TopologyKey
TopologySpreadConstraint
TopologySpreadConstraints
TypedLocalObjectReference
UID
Uncomment
@@ -944,6 +946,7 @@ tls
tmp
tmpfs
tolerations
topologies
topologyKey
transactional
transactionid
6 changes: 6 additions & 0 deletions api/v1/cluster_types.go
@@ -243,6 +243,12 @@ type ClusterSpec struct {
// +optional
Affinity AffinityConfiguration `json:"affinity,omitempty"`

// TopologySpreadConstraints specifies how to spread matching pods among the given topology.
// More info:
// https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
// +optional
TopologySpreadConstraints []corev1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`

// Resources requirements of every generated Pod. Please refer to
// https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/
// for more information.
7 changes: 7 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

173 changes: 173 additions & 0 deletions config/crd/bases/postgresql.cnpg.io_clusters.yaml
@@ -3225,6 +3225,179 @@ spec:
an infinite delay
format: int32
type: integer
topologySpreadConstraints:
description: 'TopologySpreadConstraints specifies how to spread matching
pods among the given topology. More info: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/'
items:
description: TopologySpreadConstraint specifies how to spread matching
pods among the given topology.
properties:
labelSelector:
description: LabelSelector is used to find matching pods. Pods
that match this label selector are counted to determine the
number of pods in their corresponding topology domain.
properties:
matchExpressions:
description: matchExpressions is a list of label selector
requirements. The requirements are ANDed.
items:
description: A label selector requirement is a selector
that contains values, a key, and an operator that relates
the key and values.
properties:
key:
description: key is the label key that the selector
applies to.
type: string
operator:
description: operator represents a key's relationship
to a set of values. Valid operators are In, NotIn,
Exists and DoesNotExist.
type: string
values:
description: values is an array of string values.
If the operator is In or NotIn, the values array
must be non-empty. If the operator is Exists or
DoesNotExist, the values array must be empty. This
array is replaced during a strategic merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs.
A single {key,value} in the matchLabels map is equivalent
to an element of matchExpressions, whose key field is
"key", the operator is "In", and the values array contains
only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
matchLabelKeys:
description: "MatchLabelKeys is a set of pod label keys to select
the pods over which spreading will be calculated. The keys
are used to lookup values from the incoming pod labels, those
key-value labels are ANDed with labelSelector to select the
group of existing pods over which spreading will be calculated
for the incoming pod. The same key is forbidden to exist in
both MatchLabelKeys and LabelSelector. MatchLabelKeys cannot
be set when LabelSelector isn't set. Keys that don't exist
in the incoming pod labels will be ignored. A null or empty
list means only match against labelSelector. \n This is a
beta field and requires the MatchLabelKeysInPodTopologySpread
feature gate to be enabled (enabled by default)."
items:
type: string
type: array
x-kubernetes-list-type: atomic
maxSkew:
description: 'MaxSkew describes the degree to which pods may
be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`,
it is the maximum permitted difference between the number
of matching pods in the target topology and the global minimum.
The global minimum is the minimum number of matching pods
in an eligible domain or zero if the number of eligible domains
is less than MinDomains. For example, in a 3-zone cluster,
MaxSkew is set to 1, and pods with the same labelSelector
spread as 2/2/1: In this case, the global minimum is 1. |
zone1 | zone2 | zone3 | | P P | P P | P | - if MaxSkew
is 1, incoming pod can only be scheduled to zone3 to become
2/2/2; scheduling it onto zone1(zone2) would make the ActualSkew(3-1)
on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming
pod can be scheduled onto any zone. When `whenUnsatisfiable=ScheduleAnyway`,
it is used to give higher precedence to topologies that satisfy
it. It''s a required field. Default value is 1 and 0 is not
allowed.'
format: int32
type: integer
minDomains:
description: "MinDomains indicates a minimum number of eligible
domains. When the number of eligible domains with matching
topology keys is less than minDomains, Pod Topology Spread
treats \"global minimum\" as 0, and then the calculation of
Skew is performed. And when the number of eligible domains
with matching topology keys equals or greater than minDomains,
this value has no effect on scheduling. As a result, when
the number of eligible domains is less than minDomains, scheduler
won't schedule more than maxSkew Pods to those domains. If
value is nil, the constraint behaves as if MinDomains is equal
to 1. Valid values are integers greater than 0. When value
is not nil, WhenUnsatisfiable must be DoNotSchedule. \n For
example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains
is set to 5 and pods with the same labelSelector spread as
2/2/2: | zone1 | zone2 | zone3 | | P P | P P | P P |
The number of domains is less than 5(MinDomains), so \"global
minimum\" is treated as 0. In this situation, new pod with
the same labelSelector cannot be scheduled, because computed
skew will be 3(3 - 0) if new Pod is scheduled to any of the
three zones, it will violate MaxSkew. \n This is a beta field
and requires the MinDomainsInPodTopologySpread feature gate
to be enabled (enabled by default)."
format: int32
type: integer
nodeAffinityPolicy:
description: "NodeAffinityPolicy indicates how we will treat
Pod's nodeAffinity/nodeSelector when calculating pod topology
spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector
are included in the calculations. - Ignore: nodeAffinity/nodeSelector
are ignored. All nodes are included in the calculations. \n
If this value is nil, the behavior is equivalent to the Honor
policy. This is a beta-level feature default enabled by the
NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
nodeTaintsPolicy:
description: "NodeTaintsPolicy indicates how we will treat node
taints when calculating pod topology spread skew. Options
are: - Honor: nodes without taints, along with tainted nodes
for which the incoming pod has a toleration, are included.
- Ignore: node taints are ignored. All nodes are included.
\n If this value is nil, the behavior is equivalent to the
Ignore policy. This is a beta-level feature default enabled
by the NodeInclusionPolicyInPodTopologySpread feature flag."
type: string
topologyKey:
description: TopologyKey is the key of node labels. Nodes that
have a label with this key and identical values are considered
to be in the same topology. We consider each <key, value>
as a "bucket", and try to put balanced number of pods into
each bucket. We define a domain as a particular instance of
a topology. Also, we define an eligible domain as a domain
whose nodes meet the requirements of nodeAffinityPolicy and
nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname",
each Node is a domain of that topology. And, if TopologyKey
is "topology.kubernetes.io/zone", each zone is a domain of
that topology. It's a required field.
type: string
whenUnsatisfiable:
description: 'WhenUnsatisfiable indicates how to deal with a
pod if it doesn''t satisfy the spread constraint. - DoNotSchedule
(default) tells the scheduler not to schedule it. - ScheduleAnyway
tells the scheduler to schedule the pod in any location, but
giving higher precedence to topologies that would help reduce
the skew. A constraint is considered "Unsatisfiable" for an
incoming pod if and only if every possible node assignment
for that pod would violate "MaxSkew" on some topology. For
example, in a 3-zone cluster, MaxSkew is set to 1, and pods
with the same labelSelector spread as 3/1/1: | zone1 | zone2
| zone3 | | P P P | P | P | If WhenUnsatisfiable is
set to DoNotSchedule, incoming pod can only be scheduled to
zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on
zone2(zone3) satisfies MaxSkew(1). In other words, the cluster
can still be imbalanced, but scheduler won''t make it *more*
imbalanced. It''s a required field.'
type: string
required:
- maxSkew
- topologyKey
- whenUnsatisfiable
type: object
type: array
walStorage:
description: Configuration of the storage for PostgreSQL WAL (Write-Ahead
Log)
16 changes: 16 additions & 0 deletions controllers/cluster_upgrade.go
@@ -291,6 +291,10 @@ func IsPodNeedingRollout(status postgres.PostgresqlStatus, cluster *apiv1.Cluste
return restartRequired, false, reason
}

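// Detect changes in the cluster's topology spread constraints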
if restartRequired, reason := isPodNeedingUpdatedTopology(cluster, status.Pod); restartRequired {
return restartRequired, false, reason
}

// Detect changes in the postgres container configuration
for _, container := range status.Pod.Spec.Containers {
// we go to the next array element if it isn't the postgres container
@@ -312,6 +316,18 @@ func IsPodNeedingRollout(status postgres.PostgresqlStatus, cluster *apiv1.Cluste
return needingRestart, true, reason
}

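// isPodNeedingUpdatedTopology returns a boolean indicating if a restart is required and the relative message,
// comparing the TopologySpreadConstraints declared in the cluster spec with the ones set on the pod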
func isPodNeedingUpdatedTopology(cluster *apiv1.Cluster, pod corev1.Pod) (bool, string) {
if reflect.DeepEqual(cluster.Spec.TopologySpreadConstraints, pod.Spec.TopologySpreadConstraints) {
return false, ""
}
reason := fmt.Sprintf(
"Pod '%s' does not have up-to-date TopologySpreadConstraints. It needs to match the cluster's constraints.",
pod.Name,
)

return true, reason
}

// isPodNeedingUpdatedScheduler returns a boolean indicating if a restart is required and the relative message
func isPodNeedingUpdatedScheduler(cluster *apiv1.Cluster, pod corev1.Pod) (bool, string) {
if cluster.Spec.SchedulerName == "" || cluster.Spec.SchedulerName == pod.Spec.SchedulerName {
70 changes: 70 additions & 0 deletions controllers/cluster_upgrade_test.go
@@ -18,6 +18,7 @@ package controllers

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

apiv1 "github.com/cloudnative-pg/cloudnative-pg/api/v1"
"github.com/cloudnative-pg/cloudnative-pg/pkg/postgres"
@@ -133,3 +134,72 @@ var _ = Describe("Pod upgrade", func() {
})
})
})

var _ = Describe("Test isPodNeedingUpdatedTopology", func() {
var cluster *apiv1.Cluster
var pod corev1.Pod

BeforeEach(func() {
topology := corev1.TopologySpreadConstraint{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: corev1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "test-app"},
},
}
cluster = &apiv1.Cluster{
Spec: apiv1.ClusterSpec{
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{topology},
},
}
pod = corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "test-pod",
},
Spec: corev1.PodSpec{
TopologySpreadConstraints: []corev1.TopologySpreadConstraint{topology},
},
}
})

It("should return false when the cluster and pod have the same TopologySpreadConstraints", func() {
needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeFalse())
Expect(reason).To(BeEmpty())
})

It("should return true when the cluster and pod do not have the same TopologySpreadConstraints", func() {
pod.Spec.TopologySpreadConstraints[0].MaxSkew = 2
needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return true when the LabelSelector maps are different", func() {
pod.Spec.TopologySpreadConstraints[0].LabelSelector = &metav1.LabelSelector{
MatchLabels: map[string]string{"app": "different-app"},
}

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return true when TopologySpreadConstraints is nil in one of the objects", func() {
pod.Spec.TopologySpreadConstraints = nil

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeTrue())
Expect(reason).ToNot(BeEmpty())
})

It("should return false if both are nil", func() {
cluster.Spec.TopologySpreadConstraints = nil
pod.Spec.TopologySpreadConstraints = nil

needsUpdate, reason := isPodNeedingUpdatedTopology(cluster, pod)
Expect(needsUpdate).To(BeFalse())
Expect(reason).To(BeEmpty())
})
})
