From 54677dd245577d01334b7556bbe2816dbcd8487c Mon Sep 17 00:00:00 2001 From: Bernhard Mallinger Date: Mon, 10 Feb 2025 09:52:41 +0100 Subject: [PATCH] Improve probe setup Liveness probe is now a TCP probe. This has the effect that if the server is overloaded and does not respond to HTTP requests in time, it will not be killed as long as the server still listens on the TCP port (i.e. the server is actually alive). Therefore it can finish the requests (this shouldn't take forever due to gunicorn request timeouts). The readiness probe is now an HTTP probe, so if the server is overloaded, it won't receive any new requests. Instead of initialDelaySeconds, a startup HTTP probe is used which queries the service every second, so that it's marked as available as soon as it's actually available. This is vital for one of our use cases where we scale the service to zero and only activate it on the first request, i.e. users wait for the service to actually become available. --- .../eoapi/templates/services/deployment.yaml | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/helm-chart/eoapi/templates/services/deployment.yaml b/helm-chart/eoapi/templates/services/deployment.yaml index 8c38fbe4..67d33ea6 100644 --- a/helm-chart/eoapi/templates/services/deployment.yaml +++ b/helm-chart/eoapi/templates/services/deployment.yaml @@ -40,6 +40,13 @@ spec: - "--root-path=/{{ $serviceName }}{{ $.Release.Name }}" {{- end }}{{/* needed for proxies and path rewrites on NLB */}} livenessProbe: + tcpSocket: + port: {{ $.Values.service.port }} + failureThreshold: 3 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: httpGet: {{- if (eq $serviceName "stac") }} path: /_mgmt/ping @@ -48,9 +55,20 @@ spec: {{- end }} port: {{ $.Values.service.port }} failureThreshold: 3 - initialDelaySeconds: 30 periodSeconds: 15 successThreshold: 1 + startupProbe: + httpGet: + {{- if (eq $serviceName "stac") }} + path: /_mgmt/ping + {{- else }} 
path: /healthz + {{- end }} + port: {{ $.Values.service.port }} + # check every sec for 1 minute + periodSeconds: 1 + failureThreshold: 60 + successThreshold: 1 ports: - containerPort: {{ $.Values.service.port }} resources: