From 00151b3408955ce1386854afadd64255b9b00c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20St=C3=A4bler?= Date: Tue, 28 Apr 2026 14:19:51 +0200 Subject: [PATCH 1/3] Improve e2e test stability in CI - Free ~14GB disk on GitHub runners (up from ~2GB) by removing additional unused pre-installed toolchains - Reduce KinD cluster from 3 to 2 nodes to lower disk and resource pressure - Add kubectl_apply_with_retry with exponential backoff for external URL fetches during cluster setup to survive transient 502 errors - Increase func deploy retries from 3/5s flat to 5/exponential backoff (10s-80s) to handle in-cluster-dialer pod instability --- .github/workflows/test-e2e.yml | 13 ++++++++++-- hack/create-kind-cluster.sh | 39 +++++++++++++++++++++++++--------- test/utils/func.go | 13 ++++++++---- 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index 8cb1dca..be7a805 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -23,8 +23,17 @@ jobs: steps: - name: Free up disk space run: | - # Remove large packages to free up disk space on GitHub runners - sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc + # Remove large pre-installed toolchains not needed for K8s operator testing + sudo rm -rf \ + /usr/share/dotnet \ + /usr/local/lib/android \ + /opt/ghc \ + /opt/hostedtoolcache \ + /usr/local/share/powershell \ + /usr/share/swift \ + /usr/local/.ghcup \ + /usr/local/share/chromium \ + /usr/local/lib/heroku # Clean up Docker to start fresh docker system prune -af --volumes df -h diff --git a/hack/create-kind-cluster.sh b/hack/create-kind-cluster.sh index 7dd6575..93b2cda 100755 --- a/hack/create-kind-cluster.sh +++ b/hack/create-kind-cluster.sh @@ -30,6 +30,27 @@ function header_text { echo "$header$*$reset" } +function kubectl_apply_with_retry() { + local max_attempts=5 + local delay=5 + local attempt + + for attempt in $(seq 1 $max_attempts); do + if kubectl apply "$@"; then + return 0 + fi + + if [ "$attempt" -lt "$max_attempts" ]; then + header_text "kubectl apply failed (attempt $attempt/$max_attempts), retrying in ${delay}s..." + sleep "$delay" + delay=$((delay * 2)) + fi + done + + header_text "kubectl apply failed after $max_attempts attempts" + return 1 +} + function delete_existing_cluster() { header_text "Deleting existing Kind cluster..." kind delete cluster --name "$CLUSTER_NAME" || true @@ -84,8 +105,6 @@ nodes: image: kindest/node:$NODE_VERSION - role: worker image: kindest/node:$NODE_VERSION -- role: worker - image: kindest/node:$NODE_VERSION containerdConfigPatches: - |- [plugins."io.containerd.grpc.v1.cri".registry.mirrors."localhost:$REGISTRY_PORT"] @@ -122,7 +141,7 @@ EOF function install_tekton() { header_text "Install Tekton" - kubectl apply -f https://infra.tekton.dev/tekton-releases/pipeline/previous/${TEKTON_VERSION}/release.yaml + kubectl_apply_with_retry -f https://infra.tekton.dev/tekton-releases/pipeline/previous/${TEKTON_VERSION}/release.yaml kubectl patch configmap feature-flags -n tekton-pipelines --type merge -p '{"data":{"coschedule":"disabled"}}' header_text "Waiting for Tekton to be ready..." @@ -132,9 +151,9 @@ function install_tekton() { function install_knative_serving() { header_text "Installing Knative Serving..." - kubectl apply -f https://github.com/knative/serving/releases/download/knative-${SERVING_VERSION}/serving-crds.yaml - kubectl apply -f https://github.com/knative/serving/releases/download/knative-${SERVING_VERSION}/serving-core.yaml - kubectl apply -f https://github.com/knative/net-kourier/releases/download/knative-${SERVING_VERSION}/kourier.yaml + kubectl_apply_with_retry -f https://github.com/knative/serving/releases/download/knative-${SERVING_VERSION}/serving-crds.yaml + kubectl_apply_with_retry -f https://github.com/knative/serving/releases/download/knative-${SERVING_VERSION}/serving-core.yaml + kubectl_apply_with_retry -f https://github.com/knative/net-kourier/releases/download/knative-${SERVING_VERSION}/kourier.yaml kubectl patch configmap/config-network \ --namespace knative-serving \ @@ -148,14 +167,14 @@ function install_knative_serving() { function install_keda() { header_text "Installing keda" - kubectl apply --server-side -f https://github.com/kedacore/keda/releases/download/${KEDA_VERSION}/keda-${KEDA_VERSION:1}.yaml - kubectl apply --server-side -f https://github.com/kedacore/keda/releases/download/${KEDA_VERSION}/keda-${KEDA_VERSION:1}-core.yaml + kubectl_apply_with_retry --server-side -f https://github.com/kedacore/keda/releases/download/${KEDA_VERSION}/keda-${KEDA_VERSION:1}.yaml + kubectl_apply_with_retry --server-side -f https://github.com/kedacore/keda/releases/download/${KEDA_VERSION}/keda-${KEDA_VERSION:1}-core.yaml header_text "Waiting for Keda to become ready" kubectl wait deployment --all --timeout=-1s --for=condition=Available --namespace keda header_text "Installing keda HTTP add-on" - kubectl apply --server-side -f https://github.com/kedacore/http-add-on/releases/download/${KEDA_HTTP_ADDON_VERSION}/keda-add-ons-http-${KEDA_HTTP_ADDON_VERSION:1}-crds.yaml - kubectl apply --server-side -f https://github.com/kedacore/http-add-on/releases/download/${KEDA_HTTP_ADDON_VERSION}/keda-add-ons-http-${KEDA_HTTP_ADDON_VERSION:1}.yaml + kubectl_apply_with_retry --server-side -f https://github.com/kedacore/http-add-on/releases/download/${KEDA_HTTP_ADDON_VERSION}/keda-add-ons-http-${KEDA_HTTP_ADDON_VERSION:1}-crds.yaml + kubectl_apply_with_retry --server-side -f https://github.com/kedacore/http-add-on/releases/download/${KEDA_HTTP_ADDON_VERSION}/keda-add-ons-http-${KEDA_HTTP_ADDON_VERSION:1}.yaml header_text "Waiting for Keda HTTP add-on to become ready" kubectl wait deployment --all --timeout=-1s --for=condition=Available --namespace keda } diff --git a/test/utils/func.go b/test/utils/func.go index 640ef50..85fab54 100644 --- a/test/utils/func.go +++ b/test/utils/func.go @@ -84,11 +84,16 @@ func RunFuncDeploy(functionDir string, optFns ...FuncDeployOption) (string, erro var output string var err error - // Retry up to 3 times with 5s delay between attempts - for attempt := 0; attempt < 3; attempt++ { + maxAttempts := 5 + retryDelay := 10 * time.Second + + for attempt := 0; attempt < maxAttempts; attempt++ { if attempt > 0 { - time.Sleep(5 * time.Second) - _, _ = fmt.Fprintf(ginkgo.GinkgoWriter, "func deploy attempt %d failed: %v (retrying)\n", attempt, err) + _, _ = fmt.Fprintf(ginkgo.GinkgoWriter, + "func deploy attempt %d/%d failed: %v (retrying in %s)\n", + attempt, maxAttempts, err, retryDelay) + time.Sleep(retryDelay) + retryDelay *= 2 } if opts.CliVersion != "" { From 1d042100f40acb2fb3af51bb1f37512e9ba9f6be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20St=C3=A4bler?= Date: Tue, 28 Apr 2026 15:43:03 +0200 Subject: [PATCH 2/3] Isolate pack builder PACK_HOME for parallel test execution Parallel func deploy calls with the pack builder corrupt the shared ~/.pack/volume-keys.toml file, causing deterministic failures. Create a per-deploy temporary PACK_HOME directory when using the pack builder via the new EnvVars deploy option. --- test/utils/func.go | 36 ++++++++++++++++++++++++++++++++++-- test/utils/utils.go | 2 +- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/test/utils/func.go b/test/utils/func.go index 85fab54..be2e95e 100644 --- a/test/utils/func.go +++ b/test/utils/func.go @@ -81,6 +81,21 @@ func RunFuncDeploy(functionDir string, optFns ...FuncDeployOption) (string, erro args = append(args, "--deployer", opts.Deployer) } + // When using the pack builder, create a per-deploy PACK_HOME to prevent + // parallel builds from corrupting the shared ~/.pack/volume-keys.toml. + if opts.Builder == "pack" { + packHome, err := os.MkdirTemp("", "pack-home-*") + if err != nil { + return "", fmt.Errorf("failed to create PACK_HOME: %w", err) + } + defer os.RemoveAll(packHome) + + if opts.EnvVars == nil { + opts.EnvVars = make(map[string]string) + } + opts.EnvVars["PACK_HOME"] = packHome + } + var output string var err error @@ -96,12 +111,22 @@ func RunFuncDeploy(functionDir string, optFns ...FuncDeployOption) (string, erro retryDelay *= 2 } + var funcBinary string if opts.CliVersion != "" { - output, err = RunFuncWithVersion(opts.CliVersion, "deploy", args...) + funcBinary, err = ensureFuncVersion(opts.CliVersion) + if err != nil { + return "", err + } } else { - output, err = RunFunc("deploy", args...) + funcBinary = "func" } + cmd := exec.Command(funcBinary, append([]string{"deploy"}, args...)...) + for k, v := range opts.EnvVars { + cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%s", k, v)) + } + output, err = Run(cmd) + if err == nil { return output, nil } @@ -117,6 +142,7 @@ type FuncDeployOptions struct { Builder string Deployer string CliVersion string + EnvVars map[string]string } type FuncDeployOption func(*FuncDeployOptions) @@ -145,6 +171,12 @@ func WithDeployCliVersion(version string) FuncDeployOption { } } +func WithEnvVars(envVars map[string]string) FuncDeployOption { + return func(opts *FuncDeployOptions) { + opts.EnvVars = envVars + } +} + // ensureFuncVersion ensures the specified func version is available and returns its path func ensureFuncVersion(version string) (string, error) { projectDir, err := GetProjectDir() diff --git a/test/utils/utils.go b/test/utils/utils.go index 3d27388..40dadd0 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -36,7 +36,7 @@ func Run(cmd *exec.Cmd) (string, error) { _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %q\n", err) } - cmd.Env = append(os.Environ(), "GO111MODULE=on") + cmd.Env = append(append(os.Environ(), cmd.Env...), "GO111MODULE=on") command := strings.Join(cmd.Args, " ") _, _ = fmt.Fprintf(GinkgoWriter, "running: %q\n", command) output, err := cmd.CombinedOutput() From be6f338d38c3e51edb0b40dfb10269bc52692645 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20St=C3=A4bler?= Date: Tue, 28 Apr 2026 17:13:47 +0200 Subject: [PATCH 3/3] Allow pre-pull of builder images to fail gracefully The pre-pull step is an optimization to avoid parallel pull contention during tests. If it fails due to transient network issues, the images will be pulled on-demand during the test run instead. --- .github/workflows/test-e2e.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml index be7a805..497dd86 100644 --- a/.github/workflows/test-e2e.yml +++ b/.github/workflows/test-e2e.yml @@ -79,17 +79,17 @@ jobs: if [ "${{ matrix.builder }}" = "s2i" ]; then echo "Pre-pulling S2I builder images..." - docker pull registry.access.redhat.com/ubi8/go-toolset:latest + docker pull registry.access.redhat.com/ubi8/go-toolset:latest || true # for now we only download the go image, as we only use this in the e2e tests - - kind load docker-image registry.access.redhat.com/ubi8/go-toolset:latest + + kind load docker-image registry.access.redhat.com/ubi8/go-toolset:latest || true elif [ "${{ matrix.builder }}" = "pack" ]; then echo "Pre-pulling pack builder images..." - docker pull ghcr.io/knative/builder-jammy-base:v2 - docker pull ghcr.io/knative/builder-jammy-tiny:v2 + docker pull ghcr.io/knative/builder-jammy-base:v2 || true + docker pull ghcr.io/knative/builder-jammy-tiny:v2 || true - kind load docker-image ghcr.io/knative/builder-jammy-base:v2 - kind load docker-image ghcr.io/knative/builder-jammy-tiny:v2 + kind load docker-image ghcr.io/knative/builder-jammy-base:v2 || true + kind load docker-image ghcr.io/knative/builder-jammy-tiny:v2 || true fi echo "Builder images cached in KinD cluster"