Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ingester StatefulSet reconciliation if ingester is in an unhealthy state #597

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
16 changes: 16 additions & 0 deletions .chloggen/always-reconcile-statefulset.yaml
@@ -0,0 +1,16 @@
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. operator, github action)
component: operator

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Fix ingester StatefulSet reconciliation if ingester is in an unhealthy state

# One or more tracking issues related to the change
issues: [597]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:
4 changes: 0 additions & 4 deletions .github/workflows/e2e.yaml
Expand Up @@ -111,7 +111,3 @@ jobs:

- name: Run tests
run: make e2e-upgrade
env:
UPGRADE_FROM_CATALOG: quay.io/operatorhubio/catalog:latest
UPGRADE_TO_CATALOG: localregistry:5000/tempo-operator-catalog:v100.0.0
UPGRADE_TO_VERSION: 100.0.0
8 changes: 8 additions & 0 deletions internal/manifests/ingester/ingester.go
Expand Up @@ -62,6 +62,14 @@ func statefulSet(params manifestutils.Params) (*v1.StatefulSet, error) {
Selector: &metav1.LabelSelector{
MatchLabels: labels,
},

// Changes to a StatefulSet are not propagated to pods in a broken state (e.g. CrashLoopBackOff)
// See https://github.com/kubernetes/kubernetes/issues/67250
//
// This is a workaround for the above issue.
// This setting is also in the tempo-distributed helm chart: https://github.com/grafana/helm-charts/blob/0fdf2e1900733eb104ac734f5fb0a89dc950d2c2/charts/tempo-distributed/templates/ingester/statefulset-ingester.yaml#L21
PodManagementPolicy: v1.ParallelPodManagement,

Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: k8slabels.Merge(labels, memberlist.GossipSelector),
Expand Down
1 change: 1 addition & 0 deletions internal/manifests/ingester/ingester_test.go
Expand Up @@ -73,6 +73,7 @@ func TestBuildIngester(t *testing.T) {
Selector: &metav1.LabelSelector{
MatchLabels: labels,
},
PodManagementPolicy: v1.ParallelPodManagement,
Template: corev1.PodTemplateSpec{

ObjectMeta: metav1.ObjectMeta{
Expand Down
1 change: 1 addition & 0 deletions internal/upgrade/v0_3_0.go
Expand Up @@ -6,6 +6,7 @@ import (
"github.com/grafana/tempo-operator/apis/tempo/v1alpha1"
)

// This upgrade sets the deprecated MaxSearchBytesPerTrace field to nil.
func upgrade0_3_0(ctx context.Context, u Upgrade, tempo *v1alpha1.TempoStack) (*v1alpha1.TempoStack, error) {
if tempo.Spec.LimitSpec.Global.Query.MaxSearchBytesPerTrace != nil {
tempo.Spec.LimitSpec.Global.Query.MaxSearchBytesPerTrace = nil
Expand Down
36 changes: 36 additions & 0 deletions internal/upgrade/v0_5_0.go
@@ -0,0 +1,36 @@
package upgrade

import (
"context"
"fmt"

v1 "k8s.io/api/apps/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/grafana/tempo-operator/apis/tempo/v1alpha1"
"github.com/grafana/tempo-operator/internal/manifests/manifestutils"
)

// This upgrade modifies the immutable field PodManagementPolicy of the ingester StatefulSet,
// therefore we delete the ingester StatefulSet, which will be recreated in the reconcile loop.
func upgrade0_5_0(ctx context.Context, u Upgrade, tempo *v1alpha1.TempoStack) (*v1alpha1.TempoStack, error) {
listOps := []client.ListOption{
client.MatchingLabels(manifestutils.ComponentLabels(manifestutils.IngesterComponentName, tempo.Name)),
}
ingesterList := &v1.StatefulSetList{}
err := u.Client.List(ctx, ingesterList, listOps...)
if err != nil {
return tempo, fmt.Errorf("failed to list ingester stateful sets: %w", err)
}

for _, ingester := range ingesterList.Items {
ingester := ingester
u.Log.Info("deleting ingester (will be re-created)", "ingester", ingester.Name)
err := u.Client.Delete(ctx, &ingester)
if err != nil {
return tempo, fmt.Errorf("failed to delete ingester %s: %w", ingester.Name, err)
}
}

return tempo, nil
}
4 changes: 4 additions & 0 deletions internal/upgrade/versions.go
Expand Up @@ -27,5 +27,9 @@ var (
version: *semver.MustParse("0.3.0"),
upgrade: upgrade0_3_0,
},
{
version: *semver.MustParse("0.5.0"),
upgrade: upgrade0_5_0,
},
}
)
5 changes: 5 additions & 0 deletions tests/e2e-upgrade/upgrade/03-assert.yaml
Expand Up @@ -32,3 +32,8 @@ metadata:
name: tempo-simplest-ingester
status:
readyReplicas: 1
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
commands:
- command: /bin/sh -c "kubectl get --namespace $NAMESPACE tempo simplest -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' | grep True"
19 changes: 11 additions & 8 deletions tests/e2e-upgrade/upgrade/07-assert.yaml
@@ -1,8 +1,11 @@
apiVersion: batch/v1
kind: Job
metadata:
name: verify-traces-after-upgrade
status:
conditions:
- status: "True"
type: Complete
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
commands:
# Assert TempoStack Ready condition after the CR got upgraded.
# Do not move this to the previous assert step, as in the previous assert step
# the status condition might still be true, if the operator wasn't upgraded yet.
- command: /bin/sh -c "kubectl get --namespace $NAMESPACE tempo simplest -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' | grep True"

# debugging output to verify updated manifests
- command: kubectl get --namespace $NAMESPACE deployment -o yaml
- command: kubectl get --namespace $NAMESPACE statefulset -o yaml
8 changes: 8 additions & 0 deletions tests/e2e-upgrade/upgrade/08-assert.yaml
@@ -0,0 +1,8 @@
apiVersion: batch/v1
kind: Job
metadata:
name: verify-traces-after-upgrade
status:
conditions:
- status: "True"
type: Complete
5 changes: 5 additions & 0 deletions tests/e2e/smoketest-with-jaeger/01-assert.yaml
Expand Up @@ -365,3 +365,8 @@ spec:
name: tempo-simplest-query-frontend
port:
name: jaeger-ui
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
commands:
- command: /bin/sh -c "kubectl get --namespace $NAMESPACE tempo simplest -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' | grep True"