Add allocator readiness configurations (#3142)
Also fix flakes in allocator.TestAllocatorAfterDeleteReplica, hopefully
chiayi committed May 10, 2023
1 parent d7a9e6a commit 5331864
Showing 3 changed files with 20 additions and 3 deletions.
3 changes: 3 additions & 0 deletions build/README.md
@@ -584,6 +584,9 @@ make test-e2e-integration ARGS='-run TestGameServerReserve'
#### `make test-e2e-failure`
Run the controller failure portion of the end-to-end tests.

#### `make test-e2e-allocator-crash`
Run the allocator failure portion of the end-to-end tests.
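
A hypothetical invocation, assuming this target forwards `ARGS` to `go test` the same way the neighbouring e2e targets do:

```bash
make test-e2e-allocator-crash ARGS='-run TestAllocatorAfterDeleteReplica'
```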

#### `make setup-prometheus`

Install Prometheus server using [Prometheus Community](https://prometheus-community.github.io/helm-charts)
3 changes: 3 additions & 0 deletions site/content/en/docs/Installation/Install Agones/helm.md
@@ -266,6 +266,9 @@ The following table lists the configurable parameters of the Agones chart and their default values.
| Parameter | Description | Default |
|---------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|
| `agones.allocator.labels` | [Labels][labels] added to the Agones Allocator pods | `{}` |
| `agones.allocator.readiness.initialDelaySeconds` | Initial delay (in seconds) before the first readiness probe is performed | `3` |
| `agones.allocator.readiness.periodSeconds` | How often (in seconds) the readiness probe is performed | `3` |
| `agones.allocator.readiness.failureThreshold` | Number of consecutive probe failures before the pod is marked unready | `3` |
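
As an illustrative sketch, these parameters can be overridden at install time; the release name `my-release` and the values shown are examples, not defaults from this commit:

```bash
helm install my-release agones/agones --namespace agones-system \
  --set agones.allocator.readiness.initialDelaySeconds=5 \
  --set agones.allocator.readiness.periodSeconds=5 \
  --set agones.allocator.readiness.failureThreshold=6
```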

{{% /feature %}}

17 changes: 14 additions & 3 deletions test/e2e/allocator/pod_termination_test.go
@@ -28,6 +28,8 @@ import (
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
@@ -40,13 +42,22 @@ func TestAllocatorAfterDeleteReplica(t *testing.T) {

var list *v1.PodList

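// Look up the allocator Deployment's configured replica count so the poll
// below can wait until every replica is running, not just whichever pods
// happen to exist yet (a likely source of the earlier flakes).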
dep, err := framework.KubeClient.AppsV1().Deployments("agones-system").Get(ctx, "agones-allocator", metav1.GetOptions{})
require.NoError(t, err, "Failed to get replicas")
replicaCnt := int(*(dep.Spec.Replicas))
logrus.Infof("Replica count config is %d", replicaCnt)

// poll and wait until all allocator pods are running
_ = wait.PollImmediate(retryInterval, retryTimeout, func() (done bool, err error) {
list, err = helper.GetAgonesAllocatorPods(ctx, framework)
if err != nil {
return true, err
}

if len(list.Items) != replicaCnt {
return false, nil
}

for _, allocpod := range list.Items {
podstatus := string(allocpod.Status.Phase)
logrus.Infof("Allocator Pod %s, has status of %s", allocpod.ObjectMeta.Name, podstatus)
@@ -58,9 +69,6 @@ func TestAllocatorAfterDeleteReplica(t *testing.T) {
return true, nil
})

// create fleet
flt, err := helper.CreateFleet(ctx, framework.Namespace, framework)
if !assert.Nil(t, err) {
@@ -82,6 +90,9 @@ func TestAllocatorAfterDeleteReplica(t *testing.T) {
require.NoError(t, err, "Could not delete allocator pod")
}

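// Set up the rpc client only after the old allocator pods have been deleted,
// so it connects to freshly created pods (moved here as part of the flake fix).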
grpcClient, err := helper.GetAllocatorClient(ctx, t, framework)
require.NoError(t, err, "Could not initialize rpc client")

// Wait and keep making calls till we know the draining time has passed
_ = wait.PollImmediate(retryInterval, retryTimeout, func() (bool, error) {
response, err = grpcClient.Allocate(context.Background(), request)