Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: analytics webhook retries #1520

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions artifacts/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,9 @@ spec:
type: object
additionalProperties:
type: string
retries:
description: Number of retries before incrementing canary failure count
type: number
sessionAffinity:
description: SessionAffinity represents the session affinity settings for a canary run.
type: object
Expand Down Expand Up @@ -1132,6 +1135,14 @@ spec:
additionalProperties:
type: string
type: object
webhooks:
description: Webhook status of this canary
type: object
additionalProperties:
type: object
properties:
retries:
type: number
lastAppliedSpec:
description: LastAppliedSpec of this canary
type: string
Expand Down
11 changes: 11 additions & 0 deletions charts/flagger/crds/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,9 @@ spec:
type: object
additionalProperties:
type: string
retries:
description: Number of retries before incrementing canary failure count
type: number
sessionAffinity:
description: SessionAffinity represents the session affinity settings for a canary run.
type: object
Expand Down Expand Up @@ -1132,6 +1135,14 @@ spec:
additionalProperties:
type: string
type: object
webhooks:
description: Webhook status of this canary
type: object
additionalProperties:
type: object
properties:
retries:
type: number
lastAppliedSpec:
description: LastAppliedSpec of this canary
type: string
Expand Down
11 changes: 11 additions & 0 deletions kustomize/base/flagger/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,9 @@ spec:
type: object
additionalProperties:
type: string
retries:
description: Number of retries before incrementing canary failure count
type: number
sessionAffinity:
description: SessionAffinity represents the session affinity settings for a canary run.
type: object
Expand Down Expand Up @@ -1132,6 +1135,14 @@ spec:
additionalProperties:
type: string
type: object
webhooks:
description: Webhook status of this canary
type: object
additionalProperties:
type: object
properties:
retries:
type: number
lastAppliedSpec:
description: LastAppliedSpec of this canary
type: string
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/flagger/v1beta1/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,10 @@ type CanaryWebhook struct {
// Metadata (key-value pairs) for this webhook
// +optional
Metadata *map[string]string `json:"metadata,omitempty"`

// Retries for failing webhook before incrementing rollout failure threshold
// +optional
Retries int `json:"retries,omitempty"`
}

// CanaryWebhookPayload holds the deployment info and metadata sent to webhooks
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/flagger/v1beta1/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,11 @@ type CanaryStatus struct {
LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"`
// +optional
Conditions []CanaryCondition `json:"conditions,omitempty"`
// +optional
Webhooks map[string]CanaryWebhookStatus `json:"webhooks,omitempty"`
}

// CanaryWebhookStatus is a status condition for a Canary Webhook
type CanaryWebhookStatus struct {
Retries int `json:"retries"`
}
23 changes: 23 additions & 0 deletions pkg/apis/flagger/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pkg/canary/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type Controller interface {
SetStatusWeight(canary *flaggerv1.Canary, val int) error
SetStatusIterations(canary *flaggerv1.Canary, val int) error
SetStatusPhase(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error
SetStatusWebhookRetries(canary *flaggerv1.Canary, webhook string, val int) error
Initialize(canary *flaggerv1.Canary) error
Promote(canary *flaggerv1.Canary) error
HasTargetChanged(canary *flaggerv1.Canary) (bool, error)
Expand Down
5 changes: 5 additions & 0 deletions pkg/canary/daemonset_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,8 @@ func (c *DaemonSetController) SetStatusIterations(cd *flaggerv1.Canary, val int)
func (c *DaemonSetController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error {
return setStatusPhase(c.flaggerClient, cd, phase)
}

// SetStatusWebhookRetries updates the webhook retries counter
func (c *DaemonSetController) SetStatusWebhookRetries(cd *flaggerv1.Canary, webhook string, val int) error {
return SetStatusWebhookRetries(c.flaggerClient, cd, webhook, val)
}
5 changes: 5 additions & 0 deletions pkg/canary/deployment_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,8 @@ func (c *DeploymentController) SetStatusIterations(cd *flaggerv1.Canary, val int
func (c *DeploymentController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1.CanaryPhase) error {
return setStatusPhase(c.flaggerClient, cd, phase)
}

// SetStatusWebhookRetries updates the webhook retries counter
func (c *DeploymentController) SetStatusWebhookRetries(cd *flaggerv1.Canary, webhook string, val int) error {
return SetStatusWebhookRetries(c.flaggerClient, cd, webhook, val)
}
5 changes: 5 additions & 0 deletions pkg/canary/service_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ func (c *ServiceController) SetStatusPhase(cd *flaggerv1.Canary, phase flaggerv1
return setStatusPhase(c.flaggerClient, cd, phase)
}

// SetStatusWebhookRetries updates the webhook retries counter
func (c *ServiceController) SetStatusWebhookRetries(cd *flaggerv1.Canary, webhook string, val int) error {
return SetStatusWebhookRetries(c.flaggerClient, cd, webhook, val)
}

// GetMetadata returns the pod label selector, label value and svc ports
func (c *ServiceController) GetMetadata(_ *flaggerv1.Canary) (string, string, map[string]int32, error) {
return "", "", nil, nil
Expand Down
30 changes: 30 additions & 0 deletions pkg/canary/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ func syncCanaryStatus(flaggerClient clientset.Interface, cd *flaggerv1.Canary, s
cdCopy.Status.CanaryWeight = status.CanaryWeight
cdCopy.Status.FailedChecks = status.FailedChecks
cdCopy.Status.Iterations = status.Iterations
cdCopy.Status.Webhooks = status.Webhooks
cdCopy.Status.LastAppliedSpec = hash
if status.Phase == flaggerv1.CanaryPhaseInitialized {
cdCopy.Status.LastPromotedSpec = hash
Expand Down Expand Up @@ -185,6 +186,35 @@ func setStatusPhase(flaggerClient clientset.Interface, cd *flaggerv1.Canary, pha
return nil
}

func SetStatusWebhookRetries(flaggerClient clientset.Interface, cd *flaggerv1.Canary, webhook string, val int) error {
firstTry := true
name, ns := cd.GetName(), cd.GetNamespace()
err := retry.RetryOnConflict(retry.DefaultBackoff, func() (err error) {
if !firstTry {
cd, err = flaggerClient.FlaggerV1beta1().Canaries(ns).Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("canary %s.%s get query failed: %w", name, ns, err)
}
}

cdCopy := cd.DeepCopy()
// initialize the map if it's nil
if cdCopy.Status.Webhooks == nil {
cdCopy.Status.Webhooks = make(map[string]flaggerv1.CanaryWebhookStatus)
}
cdCopy.Status.Webhooks[webhook] = flaggerv1.CanaryWebhookStatus{Retries: val}
cdCopy.Status.LastTransitionTime = metav1.Now()

err = updateStatusWithUpgrade(flaggerClient, cdCopy)
firstTry = false
return
})
if err != nil {
return fmt.Errorf("failed after retries: %w", err)
}
return nil
}

// getStatusCondition returns a condition based on type
func getStatusCondition(status flaggerv1.CanaryStatus, conditionType flaggerv1.CanaryConditionType) *flaggerv1.CanaryCondition {
for i := range status.Conditions {
Expand Down
39 changes: 34 additions & 5 deletions pkg/controller/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,16 @@ func (c *Controller) advanceCanary(name string, namespace string) {
return
}
} else {
webhookStatus := c.runAnalysisWebhooks(cd, canaryController)
if webhookStatus == "failure" {
if err := canaryController.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil {
c.recordEventWarningf(cd, "%v", err)
}
return
} else if webhookStatus == "retry" {
return
}

if ok := c.runAnalysis(cd); !ok {
if err := canaryController.SetStatusFailedChecks(cd, cd.Status.FailedChecks+1); err != nil {
c.recordEventWarningf(cd, "%v", err)
Expand Down Expand Up @@ -723,19 +733,38 @@ func (c *Controller) runBlueGreen(canary *flaggerv1.Canary, canaryController can

}

func (c *Controller) runAnalysis(canary *flaggerv1.Canary) bool {
// run external checks
func (c *Controller) runAnalysisWebhooks(canary *flaggerv1.Canary, canaryController canary.Controller) string {
var retry bool
var failure bool
for _, webhook := range canary.GetAnalysis().Webhooks {
if webhook.Type == "" || webhook.Type == flaggerv1.RolloutHook {
c.recordEventInfof(canary, "Executing %s.%s analysis webhook %s", canary.Spec.TargetRef.Name, canary.Namespace, webhook.Name)
err := CallWebhook(canary.Name, canary.Namespace, flaggerv1.CanaryPhaseProgressing, webhook)
if err != nil {
c.recordEventWarningf(canary, "Halt %s.%s advancement external check %s failed %v",
canary.Name, canary.Namespace, webhook.Name, err)
return false
if webhook.Retries > 0 && canary.Status.Webhooks[webhook.Name].Retries < webhook.Retries {
if err := canaryController.SetStatusWebhookRetries(canary, webhook.Name, canary.Status.Webhooks[webhook.Name].Retries+1); err != nil {
c.recordEventWarningf(canary, "%v", err)
}
c.recordEventWarningf(canary, "Halt %s.%s advancement external check %s failed %v retries remaining %d",
canary.Name, canary.Namespace, webhook.Name, err, webhook.Retries-canary.Status.Webhooks[webhook.Name].Retries-1)
retry = true
} else {
c.recordEventWarningf(canary, "Halt %s.%s advancement external check %s failed %v",
canary.Name, canary.Namespace, webhook.Name, err)
failure = true
}
}
}
}
if failure {
return "failure"
} else if retry {
return "retry"
}
return "success"
}

func (c *Controller) runAnalysis(canary *flaggerv1.Canary) bool {
ok := c.runBuiltinMetricChecks(canary)
if !ok {
return ok
Expand Down
74 changes: 74 additions & 0 deletions pkg/controller/scheduler_deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -646,3 +646,77 @@ func TestScheduler_DeploymentAlerts(t *testing.T) {
// initialization done - now send alert
mocks.ctrl.advanceCanary("podinfo", "default")
}

func TestScheduler_DeploymentWebhookRetry(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
}))
defer ts.Close()

tsOk := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusFound)
}))
defer ts.Close()

canary := newDeploymentTestCanary()
canary.Spec.Analysis.Webhooks = []flaggerv1.CanaryWebhook{
{
Name: "failing-url",
Type: "rollout",
URL: ts.URL,
Retries: 3,
},
{
Name: "success-url",
Type: "rollout",
URL: tsOk.URL,
Retries: 5,
},
}
mocks := newDeploymentFixture(canary)

// initializing
mocks.ctrl.advanceCanary("podinfo", "default")

// make primary ready
mocks.makePrimaryReady(t)

// initialized
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseInitialized))

// update
dep2 := newDeploymentTestDeploymentV2()
_, err := mocks.kubeClient.AppsV1().Deployments("default").Update(context.TODO(), dep2, metav1.UpdateOptions{})
require.NoError(t, err)

// detect changes
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing))
mocks.makeCanaryReady(t)

// progressing
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseProgressing))

// verify webhook retry counter increases
mocks.ctrl.advanceCanary("podinfo", "default")
// Failing URL 1, Success URL 0
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "failing-url", 1))
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "success-url", 0))
// Failing URL 2, Success URL 0
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "failing-url", 2))
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "success-url", 0))
// Failing URL 3, Success URL 0
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "failing-url", 3))
require.NoError(t, assertWebhookRetries(mocks.flaggerClient, "podinfo", "success-url", 0))

// update failed checks to max
mocks.deployer.SyncStatus(mocks.canary, flaggerv1.CanaryStatus{Phase: flaggerv1.CanaryPhaseProgressing, FailedChecks: 10})

// canary failed due to failed webhook
mocks.ctrl.advanceCanary("podinfo", "default")
require.NoError(t, assertPhase(mocks.flaggerClient, "podinfo", flaggerv1.CanaryPhaseFailed))
}
13 changes: 13 additions & 0 deletions pkg/controller/scheduler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,19 @@ func assertPhase(flaggerClient clientset.Interface, canary string, phase flagger
return nil
}

func assertWebhookRetries(flaggerClient clientset.Interface, canary string, webhook string, retries int) error {
c, err := flaggerClient.FlaggerV1beta1().Canaries("default").Get(context.TODO(), canary, metav1.GetOptions{})
if err != nil {
return err
}

if c.Status.Webhooks[webhook].Retries != retries {
return fmt.Errorf("got webhook status retries %d wanted %d %v", c.Status.Webhooks[webhook].Retries, retries, c.Status.Webhooks)
}

return nil
}

func alwaysReady() bool {
return true
}
Expand Down