If the user has specified cluster autoscaling behavior for their (#2754)

gameserver then don't overwrite it.
googleforgames · Sep 30, 2022 · acf760b · acf760b
1 parent 29b130f
commit acf760b
Show file tree

Hide file tree

Showing 3 changed files with 109 additions and 14 deletions.
diff --git a/pkg/apis/agones/v1/gameserver.go b/pkg/apis/agones/v1/gameserver.go
@@ -99,6 +99,10 @@ const (
 	// becomes ready, so we can track when restarts should occur and when a GameServer
 	// should be moved to Unhealthy.
 	GameServerReadyContainerIDAnnotation = agones.GroupName + "/ready-container-id"
+	// PodSafeToEvictAnnotation is an annotation that the Kubernetes cluster autoscaler uses to
+	// determine if a pod can safely be evicted to compact a cluster by moving pods between nodes
+	// and scaling down nodes.
+	PodSafeToEvictAnnotation = "cluster-autoscaler.kubernetes.io/safe-to-evict"
 )
 
 var (
@@ -632,7 +636,7 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) {
 		pod.ObjectMeta.Labels = make(map[string]string, 2)
 	}
 	if pod.ObjectMeta.Annotations == nil {
-		pod.ObjectMeta.Annotations = make(map[string]string, 1)
+		pod.ObjectMeta.Annotations = make(map[string]string, 2)
 	}
 	pod.ObjectMeta.Labels[RoleLabel] = GameServerLabelRole
 	// store the GameServer name as a label, for easy lookup later on
@@ -642,10 +646,13 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) {
 	ref := metav1.NewControllerRef(gs, SchemeGroupVersion.WithKind("GameServer"))
 	pod.ObjectMeta.OwnerReferences = append(pod.ObjectMeta.OwnerReferences, *ref)
 
-	if gs.Spec.Scheduling == apis.Packed {
-		// This means that the autoscaler cannot remove the Node that this Pod is on.
-		// (and evict the Pod in the process)
-		pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+	// This means that the autoscaler cannot remove the Node that this Pod is on.
+	// (and evict the Pod in the process). Only set the value if it has not already
+	// been configured in the pod template (to not override user specified behavior).
+	// We only set this for packed game servers, under the assumption that if
+	// game servers are distributed then the cluster autoscaler isn't likely running.
+	if _, exists := pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation]; !exists && gs.Spec.Scheduling == apis.Packed {
+		pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation] = "false"
 	}
 
 	// Add Agones version into Pod Annotations

diff --git a/pkg/apis/agones/v1/gameserver_test.go b/pkg/apis/agones/v1/gameserver_test.go
@@ -1265,7 +1265,7 @@ func TestGameServerPodObjectMeta(t *testing.T) {
 		gs.podObjectMeta(pod)
 		f(t, gs, pod)
 
-		assert.Equal(t, "false", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"])
+		assert.Equal(t, "false", pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation])
 	})
 
 	t.Run("distributed", func(t *testing.T) {
@@ -1276,10 +1276,68 @@ func TestGameServerPodObjectMeta(t *testing.T) {
 		gs.podObjectMeta(pod)
 		f(t, gs, pod)
 
-		assert.Equal(t, "", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"])
+		assert.Equal(t, "", pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation])
 	})
 }
 
+func TestGameServerPodAutoscalerAnnotations(t *testing.T) {
+	testCases := []struct {
+		description        string
+		scheduling         apis.SchedulingStrategy
+		setAnnotation      bool
+		expectedAnnotation string
+	}{
+		{
+			description:        "Packed",
+			scheduling:         apis.Packed,
+			expectedAnnotation: "false",
+		},
+		{
+			description:        "Distributed",
+			scheduling:         apis.Distributed,
+			expectedAnnotation: "",
+		},
+		{
+			description:        "Packed with autoscaler annotation",
+			scheduling:         apis.Packed,
+			setAnnotation:      true,
+			expectedAnnotation: "true",
+		},
+		{
+			description:        "Distributed with autoscaler annotation",
+			scheduling:         apis.Distributed,
+			setAnnotation:      true,
+			expectedAnnotation: "true",
+		},
+	}
+
+	fixture := &GameServer{
+		ObjectMeta: metav1.ObjectMeta{Name: "logan"},
+		Spec:       GameServerSpec{Container: "sheep"},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			gs := fixture.DeepCopy()
+			gs.Spec.Scheduling = tc.scheduling
+			if tc.setAnnotation {
+				gs.Spec.Template = corev1.PodTemplateSpec{ObjectMeta: metav1.ObjectMeta{
+					Annotations: map[string]string{PodSafeToEvictAnnotation: "true"},
+				}}
+			}
+			pod, err := gs.Pod()
+			assert.Nil(t, err, "Pod should not return an error")
+			assert.Equal(t, gs.ObjectMeta.Name, pod.ObjectMeta.Name)
+			assert.Equal(t, gs.ObjectMeta.Namespace, pod.ObjectMeta.Namespace)
+			assert.Equal(t, GameServerLabelRole, pod.ObjectMeta.Labels[RoleLabel])
+			assert.Equal(t, "gameserver", pod.ObjectMeta.Labels[agones.GroupName+"/role"])
+			assert.Equal(t, gs.ObjectMeta.Name, pod.ObjectMeta.Labels[GameServerPodLabel])
+			assert.Equal(t, "sheep", pod.ObjectMeta.Annotations[GameServerContainerAnnotation])
+			assert.True(t, metav1.IsControlledBy(pod, gs))
+			assert.Equal(t, tc.expectedAnnotation, pod.ObjectMeta.Annotations[PodSafeToEvictAnnotation])
+		})
+	}
+}
+
 func TestGameServerPodScheduling(t *testing.T) {
 	fixture := &corev1.Pod{Spec: corev1.PodSpec{}}
 

diff --git a/site/content/en/docs/Advanced/scheduling-and-autoscaling.md b/site/content/en/docs/Advanced/scheduling-and-autoscaling.md
@@ -35,7 +35,7 @@ or their cloud specific documentation.
 ## Fleet Autoscaling
 
 Fleet autoscaling is the only type of autoscaling that exists in Agones. It is currently available as a
-buffer autoscaling strategy or as a webhook driven strategy, such that you can provide your own autoscaling logic. 
+buffer autoscaling strategy or as a webhook driven strategy, such that you can provide your own autoscaling logic.
 
 Have a look at the [Create a Fleet Autoscaler]({{< relref "../Getting Started/create-fleetautoscaler.md" >}}) quickstart, the
 [Create a Webhook Fleet Autoscaler]({{< relref "../Getting Started/create-webhook-fleetautoscaler.md" >}}) quickstart,
@@ -58,7 +58,7 @@ when it is created.
 
 ### Fleet Scale Down Strategy
 
-Fleet Scale Down strategy refers to the order in which the `GameServers` that belong to a `Fleet` are deleted, 
+Fleet Scale Down strategy refers to the order in which the `GameServers` that belong to a `Fleet` are deleted,
 when Fleets are shrunk in size.
 
 ## Fleet Scheduling
@@ -86,7 +86,7 @@ spec:
             image: {{% example-image %}}
 ```
 
-This is the *default* Fleet scheduling strategy. It is designed for dynamic Kubernetes environments, wherein you wish 
+This is the *default* Fleet scheduling strategy. It is designed for dynamic Kubernetes environments, wherein you wish
 to scale up and down as load increases or decreases, such as in a Cloud environment where you are paying
 for the infrastructure you use.
 
@@ -97,13 +97,43 @@ This affects the Cluster autoscaler, Allocation Scheduling, Pod Scheduling and F
 
 #### Cluster Autoscaler
 
+{{% feature expiryVersion="1.27.0" %}}
 To ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during
 gameplay, Agones adds the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node)
 to the backing Pod.
+{{% /feature %}}
+
+{{% feature publishVersion="1.27.0" %}}
+When using the “Packed” strategy, Agones will ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during
+gameplay by adding the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node)
+to the backing Pod.
+
+However, if a gameserver can tolerate [being evicted](https://kubernetes.io/docs/concepts/scheduling-eviction/api-eviction/#how-api-initiated-eviction-works)
+(generally in combination with setting an appropriate graceful termination period on the gameserver pod) and you
+want the Cluster Autoscaler to compact your cluster by evicting game servers when it would allow the Cluster
+Autoscaler to reduce the number of nodes in the cluster, then this behavior can be overridden by explicitly setting the
+`"cluster-autoscaler.kubernetes.io/safe-to-evict"` annotation to `"true"` in the metadata for the game server pod, e.g.
+
+```
+apiVersion: "agones.dev/v1"
+kind: GameServer
+metadata:
+  name: "simple-game-server"
+spec:
+  template:
+    # pod metadata. Name & Namespace is overwritten
+    metadata:
+      annotations:
+        cluster-autoscaler.kubernetes.io/safe-to-evict: true
+    spec:
+      containers:
+      - image: {{< example-image >}}
+```
+{{% /feature %}}
 
 #### Allocation Scheduling Strategy
 
-Under the "Packed" strategy, allocation will prioritise allocating `GameServers` to nodes that are running on 
+Under the "Packed" strategy, allocation will prioritise allocating `GameServers` to nodes that are running on
 Nodes that already have allocated `GameServers` running on them.
 
 #### Pod Scheduling Strategy
@@ -113,13 +143,13 @@ with a `preferredDuringSchedulingIgnoredDuringExecution` affinity with [hostname
 topology. This attempts to group together `GameServer` Pods within as few nodes in the cluster as it can.
 
 {{< alert title="Note" color="info">}}
-The default Kubernetes scheduler doesn't do a perfect job of packing, but it's a good enough job for what we need - 
-  at least at this stage. 
+The default Kubernetes scheduler doesn't do a perfect job of packing, but it's a good enough job for what we need -
+  at least at this stage.
 {{< /alert >}}
 
 #### Fleet Scale Down Strategy
 
-With the "Packed" strategy, Fleets will remove `Ready` `GameServers` from Nodes with the _least_ number of `Ready` and 
+With the "Packed" strategy, Fleets will remove `Ready` `GameServers` from Nodes with the _least_ number of `Ready` and
 `Allocated` `GameServers` on them. Attempting to empty Nodes so that they can be safely removed.
 
 ### Distributed