From b7c5e8d8eeeadc5a8370ebb79ecc2abd4ff7c8dc Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Thu, 26 Jan 2023 18:38:30 +0100 Subject: [PATCH 01/14] Update default stack version to 8.6.1 --- internal/install/stack_version.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/install/stack_version.go b/internal/install/stack_version.go index 1f742edd3c..9e07dcf730 100644 --- a/internal/install/stack_version.go +++ b/internal/install/stack_version.go @@ -6,5 +6,5 @@ package install const ( // DefaultStackVersion is the default version of the stack - DefaultStackVersion = "8.5.1" + DefaultStackVersion = "8.6.1" ) From bcb5cbfa8a05c73057ab3f0e8d859503fb6c3a35 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Fri, 27 Jan 2023 13:31:59 +0100 Subject: [PATCH 02/14] Increase fleet-server and elastic-agent healthcheck retries --- internal/install/_static/docker-custom-agent-base.yml | 2 +- internal/profile/_static/docker-compose-stack.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/install/_static/docker-custom-agent-base.yml b/internal/install/_static/docker-custom-agent-base.yml index 610ef273df..d2d2a80079 100644 --- a/internal/install/_static/docker-custom-agent-base.yml +++ b/internal/install/_static/docker-custom-agent-base.yml @@ -4,7 +4,7 @@ services: image: "${ELASTIC_AGENT_IMAGE_REF}" healthcheck: test: "elastic-agent status" - retries: 180 + retries: 360 interval: 1s hostname: docker-custom-agent environment: diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 827e1e62cc..3d3847bf38 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -89,7 +89,7 @@ services: condition: service_healthy healthcheck: test: "curl --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status | grep -i healthy 2>&1 >/dev/null" - retries: 60 + retries: 
180 interval: 5s hostname: docker-fleet-server environment: @@ -124,7 +124,7 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" - retries: 180 + retries: 360 interval: 1s hostname: docker-fleet-agent env_file: "./elastic-agent.${STACK_VERSION_VARIANT}.env" From 0116827eb8a38eead3ffd2d0e3fdaeb266454060 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Fri, 27 Jan 2023 14:07:33 +0100 Subject: [PATCH 03/14] Add start period --- internal/profile/_static/docker-compose-stack.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 3d3847bf38..af65623fbc 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -89,6 +89,7 @@ services: condition: service_healthy healthcheck: test: "curl --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status | grep -i healthy 2>&1 >/dev/null" + start_period: 60s retries: 180 interval: 5s hostname: docker-fleet-server @@ -124,6 +125,7 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" + start_period: 60s retries: 360 interval: 1s hostname: docker-fleet-agent From bc6a6b91359c29e77f028fee4802f8a3ac49e711 Mon Sep 17 00:00:00 2001 From: Mario Rodriguez Molins Date: Fri, 27 Jan 2023 17:12:40 +0100 Subject: [PATCH 04/14] Remove release experimental in packages used in build-zip --- test/packages/benchmarks/pipeline_benchmark/manifest.yml | 1 - test/packages/benchmarks/use_pipeline_tests/manifest.yml | 1 - test/packages/other/multiinput/manifest.yml | 1 - test/packages/other/pipeline_tests/manifest.yml | 1 - 4 files changed, 4 deletions(-) diff --git a/test/packages/benchmarks/pipeline_benchmark/manifest.yml b/test/packages/benchmarks/pipeline_benchmark/manifest.yml index f7713fd52c..bce8399a28 100644 --- a/test/packages/benchmarks/pipeline_benchmark/manifest.yml +++ 
b/test/packages/benchmarks/pipeline_benchmark/manifest.yml @@ -7,7 +7,6 @@ title: Pipeline benchmarks version: 999.999.999 description: Test for pipeline test runner categories: ["network"] -release: experimental license: basic type: integration conditions: diff --git a/test/packages/benchmarks/use_pipeline_tests/manifest.yml b/test/packages/benchmarks/use_pipeline_tests/manifest.yml index b30a02942b..f145d71818 100644 --- a/test/packages/benchmarks/use_pipeline_tests/manifest.yml +++ b/test/packages/benchmarks/use_pipeline_tests/manifest.yml @@ -7,7 +7,6 @@ title: Use pipeline tests for the benchmark version: 999.999.999 description: Test for pipeline test runner categories: ["network"] -release: experimental license: basic type: integration conditions: diff --git a/test/packages/other/multiinput/manifest.yml b/test/packages/other/multiinput/manifest.yml index 7eb589115b..7ffaf75b38 100644 --- a/test/packages/other/multiinput/manifest.yml +++ b/test/packages/other/multiinput/manifest.yml @@ -7,7 +7,6 @@ title: Multi-input test version: 999.999.999 description: Test for multiple input tests categories: ["network"] -release: experimental license: basic type: integration conditions: diff --git a/test/packages/other/pipeline_tests/manifest.yml b/test/packages/other/pipeline_tests/manifest.yml index 042c0d21ff..96df3f5237 100644 --- a/test/packages/other/pipeline_tests/manifest.yml +++ b/test/packages/other/pipeline_tests/manifest.yml @@ -7,7 +7,6 @@ title: Pipeline tests version: 999.999.999 description: Test for pipeline test runner categories: ["network"] -release: experimental license: basic type: integration conditions: From 8f1b3d0c4db8620adc07f3aaaadb9b0998fa4940 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Fri, 27 Jan 2023 19:18:50 +0100 Subject: [PATCH 05/14] Add restart policy for elastic-agent --- internal/profile/_static/docker-compose-stack.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index af65623fbc..95bba7c1ae 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -128,6 +128,8 @@ services: start_period: 60s retries: 360 interval: 1s + # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. + restart: on-failure:5 hostname: docker-fleet-agent env_file: "./elastic-agent.${STACK_VERSION_VARIANT}.env" volumes: From 3dde7d633e33fe79f3b05ebb46da56947fd161ae Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Fri, 27 Jan 2023 20:54:12 +0100 Subject: [PATCH 06/14] Use pause image instead of true --- internal/profile/_static/docker-compose-stack.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 95bba7c1ae..eb72675a99 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -18,7 +18,7 @@ services: - "127.0.0.1:9200:9200" elasticsearch_is_ready: - image: tianon/true + image: k8s.gcr.io/pause:3.3 depends_on: elasticsearch: condition: service_healthy @@ -47,7 +47,7 @@ services: - "127.0.0.1:5601:5601" kibana_is_ready: - image: tianon/true + image: k8s.gcr.io/pause:3.3 depends_on: kibana: condition: service_healthy @@ -75,7 +75,7 @@ services: - "127.0.0.1:9000:9000" package-registry_is_ready: - image: tianon/true + image: k8s.gcr.io/pause:3.3 depends_on: package-registry: condition: service_healthy @@ -113,7 +113,7 @@ services: - "127.0.0.1:8220:8220" fleet-server_is_ready: - image: tianon/true + image: k8s.gcr.io/pause:3.3 depends_on: fleet-server: condition: service_healthy @@ -139,7 +139,7 @@ services: target: /tmp/service_logs/ elastic-agent_is_ready: - image: tianon/true + image: k8s.gcr.io/pause:3.3 depends_on: elastic-agent: 
condition: service_healthy From b1a43817cc865ec719cc01948922c7d4820ad83d Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sat, 28 Jan 2023 13:37:12 +0100 Subject: [PATCH 07/14] Try only with restarts --- .../install/_static/docker-custom-agent-base.yml | 2 +- .../profile/_static/docker-compose-stack.yml | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/internal/install/_static/docker-custom-agent-base.yml b/internal/install/_static/docker-custom-agent-base.yml index d2d2a80079..610ef273df 100644 --- a/internal/install/_static/docker-custom-agent-base.yml +++ b/internal/install/_static/docker-custom-agent-base.yml @@ -4,7 +4,7 @@ services: image: "${ELASTIC_AGENT_IMAGE_REF}" healthcheck: test: "elastic-agent status" - retries: 360 + retries: 180 interval: 1s hostname: docker-custom-agent environment: diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index eb72675a99..6921ade432 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -18,7 +18,7 @@ services: - "127.0.0.1:9200:9200" elasticsearch_is_ready: - image: k8s.gcr.io/pause:3.3 + image: tianon/true depends_on: elasticsearch: condition: service_healthy @@ -47,7 +47,7 @@ services: - "127.0.0.1:5601:5601" kibana_is_ready: - image: k8s.gcr.io/pause:3.3 + image: tianon/true depends_on: kibana: condition: service_healthy @@ -75,7 +75,7 @@ services: - "127.0.0.1:9000:9000" package-registry_is_ready: - image: k8s.gcr.io/pause:3.3 + image: tianon/true depends_on: package-registry: condition: service_healthy @@ -89,8 +89,7 @@ services: condition: service_healthy healthcheck: test: "curl --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status | grep -i healthy 2>&1 >/dev/null" - start_period: 60s - retries: 180 + retries: 60 interval: 5s hostname: docker-fleet-server environment: @@ -113,7 +112,7 @@ services: - 
"127.0.0.1:8220:8220" fleet-server_is_ready: - image: k8s.gcr.io/pause:3.3 + image: tianon/true depends_on: fleet-server: condition: service_healthy @@ -125,8 +124,7 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" - start_period: 60s - retries: 360 + retries: 180 interval: 1s # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. restart: on-failure:5 @@ -139,7 +137,7 @@ services: target: /tmp/service_logs/ elastic-agent_is_ready: - image: k8s.gcr.io/pause:3.3 + image: tianon/true depends_on: elastic-agent: condition: service_healthy From 57a0e4b6cbb386685d79978b7fd80e860cd8429f Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sat, 28 Jan 2023 13:56:04 +0100 Subject: [PATCH 08/14] Use start period instead of retries --- .../profile/_static/docker-compose-stack.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 6921ade432..146f00455e 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -4,8 +4,8 @@ services: image: "${ELASTICSEARCH_IMAGE_REF}" healthcheck: test: "curl -s --cacert /usr/share/elasticsearch/config/certs/ca-cert.pem -f -u elastic:changeme https://127.0.0.1:9200/_cat/health | cut -f4 -d' ' | grep -E '(green|yellow)'" - retries: 300 - interval: 1s + start_period: 300s + interval: 5s environment: - "ES_JAVA_OPTS=-Xms1g -Xmx1g" - "ELASTIC_PASSWORD=changeme" @@ -32,8 +32,8 @@ services: condition: service_healthy healthcheck: test: "sh /usr/share/kibana/healthcheck.sh" - retries: 600 - interval: 1s + start_period: 600s + interval: 5s env_file: - "../certs/ca.env" environment: @@ -60,8 +60,8 @@ services: PROFILE: "${PROFILE_NAME}" healthcheck: test: ["CMD", "curl", "--cacert", "/etc/ssl/package-registry/ca-cert.pem", "-f", "https://localhost:8080"] - retries: 
300 - interval: 1s + start_period: 300s + interval: 5s environment: - "EPR_LOG_LEVEL=debug" - "EPR_ADDRESS=0.0.0.0:8080" @@ -89,7 +89,7 @@ services: condition: service_healthy healthcheck: test: "curl --cacert /etc/ssl/elastic-agent/ca-cert.pem -f https://localhost:8220/api/status | grep -i healthy 2>&1 >/dev/null" - retries: 60 + start_period: 60s interval: 5s hostname: docker-fleet-server environment: @@ -124,8 +124,8 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" - retries: 180 - interval: 1s + start_period: 180s + interval: 5s # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. restart: on-failure:5 hostname: docker-fleet-agent From e8017b200632b386de9e653fada4e3804f74d776 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sat, 28 Jan 2023 14:26:25 +0100 Subject: [PATCH 09/14] Increase elastic-agent start period --- internal/profile/_static/docker-compose-stack.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 146f00455e..c0d7059fb2 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -124,7 +124,7 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" - start_period: 180s + start_period: 360s interval: 5s # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. 
restart: on-failure:5 From 514864445be638f722140842337e8263be0af292 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sat, 28 Jan 2023 18:03:34 +0100 Subject: [PATCH 10/14] Increase retries for elastic-agent healthchecks --- internal/profile/_static/docker-compose-stack.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index c0d7059fb2..8e064d055a 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -125,6 +125,7 @@ services: healthcheck: test: "elastic-agent status" start_period: 360s + retries: 20 interval: 5s # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. restart: on-failure:5 From 2de9427506293e06d3db597ab7a08b13817879c8 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sat, 28 Jan 2023 20:01:04 +0100 Subject: [PATCH 11/14] Increase number of retries --- internal/profile/_static/docker-compose-stack.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index 8e064d055a..f45fa8647b 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -124,8 +124,9 @@ services: condition: service_healthy healthcheck: test: "elastic-agent status" + timeout: 2s start_period: 360s - retries: 20 + retries: 180 interval: 5s # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. 
restart: on-failure:5 From 86f11a2bf147ce9c4fdc8a370ae24006af93bf08 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sun, 29 Jan 2023 13:58:28 +0100 Subject: [PATCH 12/14] Retry to boot the stack up if only elastic-agent failed to start --- .../profile/_static/docker-compose-stack.yml | 2 -- internal/stack/boot.go | 27 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/internal/profile/_static/docker-compose-stack.yml b/internal/profile/_static/docker-compose-stack.yml index f45fa8647b..0ad5b8b3e0 100644 --- a/internal/profile/_static/docker-compose-stack.yml +++ b/internal/profile/_static/docker-compose-stack.yml @@ -128,8 +128,6 @@ services: start_period: 360s retries: 180 interval: 5s - # Handle failures caused by fleet-server being restarted or reconfigured after the health-check passed. - restart: on-failure:5 hostname: docker-fleet-agent env_file: "./elastic-agent.${STACK_VERSION_VARIANT}.env" volumes: diff --git a/internal/stack/boot.go b/internal/stack/boot.go index 61b1426bb3..823b92cd25 100644 --- a/internal/stack/boot.go +++ b/internal/stack/boot.go @@ -60,12 +60,39 @@ func BootUp(options Options) error { err = dockerComposeUp(options) if err != nil { + // At least starting on 8.6.0, fleet-server may be reconfigured or + // restarted after being healthy. If elastic-agent tries to enroll at + // this moment, it fails inmediately and makes `docker-compose up` to fail too. + // As workaround, try to give another chance to docker-compose if only + // elastic-agent failed. 
+ if onlyElasticAgentFailed() { + err = dockerComposeUp(options) + } return errors.Wrap(err, "running docker-compose failed") } return nil } +func onlyElasticAgentFailed() bool { + status, err := Status() + if err != nil { + fmt.Println("Failed to check status of the stack after failure: %s", err) + return false + } + + for _, service := range status { + if strings.Contains(service.Name, "elastic-agent") { + continue + } + if !strings.HasPrefix(service.Status, "running") { + return false + } + } + + return true +} + // TearDown function takes down the testing stack. func TearDown(options Options) error { err := dockerComposeDown(options) From 790c8ef6c7f6d404aa539fe13e51503834c3d9df Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sun, 29 Jan 2023 14:00:14 +0100 Subject: [PATCH 13/14] Add info message --- internal/stack/boot.go | 1 + 1 file changed, 1 insertion(+) diff --git a/internal/stack/boot.go b/internal/stack/boot.go index 823b92cd25..a1cce8b8aa 100644 --- a/internal/stack/boot.go +++ b/internal/stack/boot.go @@ -66,6 +66,7 @@ func BootUp(options Options) error { // As workaround, try to give another chance to docker-compose if only // elastic-agent failed. if onlyElasticAgentFailed() { + fmt.Println("Elastic Agent failed to start, trying again.") err = dockerComposeUp(options) } return errors.Wrap(err, "running docker-compose failed") From 5540965e49eea63e047dc6439a0bc3bf32d77766 Mon Sep 17 00:00:00 2001 From: Jaime Soriano Pastor Date: Sun, 29 Jan 2023 19:58:02 +0100 Subject: [PATCH 14/14] Fix print --- internal/stack/boot.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/stack/boot.go b/internal/stack/boot.go index a1cce8b8aa..b2262a7ed7 100644 --- a/internal/stack/boot.go +++ b/internal/stack/boot.go @@ -62,8 +62,9 @@ func BootUp(options Options) error { if err != nil { // At least starting on 8.6.0, fleet-server may be reconfigured or // restarted after being healthy. 
If elastic-agent tries to enroll at - // this moment, it fails inmediately and makes `docker-compose up` to fail too. - // As workaround, try to give another chance to docker-compose if only + // this moment, it fails immediately, stopping and making `docker-compose up` + // fail too. + // As a workaround, try to give another chance to docker-compose if only // elastic-agent failed. if onlyElasticAgentFailed() { fmt.Println("Elastic Agent failed to start, trying again.") @@ -78,7 +79,7 @@ func BootUp(options Options) error { func onlyElasticAgentFailed() bool { status, err := Status() if err != nil { - fmt.Println("Failed to check status of the stack after failure: %s", err) + fmt.Printf("Failed to check status of the stack after failure: %v\n", err) return false }