Add support to latency alerts
wpjunior committed Aug 6, 2019
1 parent 113c717 commit 361243a
Showing 5 changed files with 159 additions and 77 deletions.
6 changes: 6 additions & 0 deletions algorithms/algorithm.go
@@ -4,6 +4,7 @@ import "github.com/prometheus/prometheus/pkg/rulefmt"

type AlertAlgorithm interface {
AlertForError(serviceName string, availabilityTarget float64, annotations map[string]string) []rulefmt.Rule
AlertForLatency(serviceName string, targets []LatencyTarget, annotations map[string]string) []rulefmt.Rule
}

var algorithms = map[string]AlertAlgorithm{}
@@ -16,3 +17,8 @@ func register(algorithm AlertAlgorithm, name string) AlertAlgorithm {
func Get(name string) AlertAlgorithm {
return algorithms[name]
}

type LatencyTarget struct {
LE string `yaml:"le"`
Target float64 `yaml:"target"`
}
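
For orientation, here is a minimal usage sketch of the extended interface (not part of this commit). The service name, bucket targets, and annotation text are illustrative; algorithms.Get, AlertForLatency, LatencyTarget, and the "multi-window" registration come from the code in this diff, and the import path matches the one used in slo/slo_test.go.

package main

import (
	"fmt"

	algorithms "github.com/globocom/slo-generator/algorithms"
)

func main() {
	// Look up the algorithm registered at the bottom of multi-window.go.
	algo := algorithms.Get("multi-window")

	// Illustrative targets: 95% of requests under 0.1s, 99% under 0.5s.
	targets := []algorithms.LatencyTarget{
		{LE: "0.1", Target: 95},
		{LE: "0.5", Target: 99},
	}

	rules := algo.AlertForLatency(
		"my-team.my-service.payment",
		targets,
		map[string]string{"message": "latency burn rate is consuming the error budget"},
	)

	// Each rulefmt.Rule carries an alert name, a PromQL expression,
	// labels, and annotations, as exercised in slo/slo_test.go below.
	for _, rule := range rules {
		fmt.Println(rule.Alert)
		fmt.Println(rule.Expr)
	}
}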
111 changes: 84 additions & 27 deletions algorithms/multi-window.go
@@ -2,6 +2,7 @@ package algoritms

import (
"fmt"
"strings"

"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/rulefmt"
@@ -13,12 +14,11 @@ func (*MultiWindowAlgorithm) AlertForError(serviceName string, availabilityTarge
rules := []rulefmt.Rule{
{
Alert: "slo:" + serviceName + ".errors.page",
Expr: multiBurnRate(MultiRateOpts{
Metric: "slo:service_errors_total",
Labels: labels.New(labels.Label{"service", serviceName}),
Operator: ">",
Value: (1 - availabilityTarget/100),
Kind: "page",
Expr: multiBurnRate(MultiRateErrorOpts{
Metric: "slo:service_errors_total",
Labels: labels.New(labels.Label{"service", serviceName}),
Value: (1 - availabilityTarget/100),
Kind: "page",
}),
Annotations: annotations,
Labels: map[string]string{
@@ -27,29 +27,67 @@
},
{
Alert: "slo:" + serviceName + ".errors.ticket",
Expr: multiBurnRate(MultiRateOpts{
Metric: "slo:service_errors_total",
Labels: labels.New(labels.Label{"service", serviceName}),
Operator: ">",
Value: (1 - availabilityTarget/100),
Kind: "ticket",
Expr: multiBurnRate(MultiRateErrorOpts{
Metric: "slo:service_errors_total",
Labels: labels.New(labels.Label{"service", serviceName}),
Value: (1 - availabilityTarget/100),
Kind: "ticket",
}),
Annotations: annotations,
Labels: map[string]string{
"severity": "ticket",
},
},
}
// rulefmt.Rule

return rules
}

func (*MultiWindowAlgorithm) AlertForLatency(serviceName string, targets []LatencyTarget, annotations map[string]string) []rulefmt.Rule {
rules := []rulefmt.Rule{
{
Alert: "slo:" + serviceName + ".latency.page",
Expr: multiBurnRateLatency(MultiRateLatencyOpts{
Metric: "slo:service_latency",
Label: labels.Label{"service", serviceName},
Buckets: targets,
Kind: "page",
}),
Annotations: annotations,
Labels: map[string]string{
"severity": "page",
},
},
{
Alert: "slo:" + serviceName + ".latency.ticket",
Expr: multiBurnRateLatency(MultiRateLatencyOpts{
Metric: "slo:service_latency",
Label: labels.Label{"service", serviceName},
Buckets: targets,
Kind: "ticket",
}),
Annotations: annotations,
Labels: map[string]string{
"severity": "ticket",
},
},
}

return rules
}

type MultiRateOpts struct {
Metric string
Labels labels.Labels
Operator string
Value float64
Kind string // page or ticket
type MultiRateErrorOpts struct {
Metric string
Labels labels.Labels
Value float64
Kind string // page or ticket
}

type MultiRateLatencyOpts struct {
Metric string
Label labels.Label
Buckets []LatencyTarget
Kind string // page or ticket
}

type MultiRateWindow [2]struct {
@@ -85,19 +123,38 @@ var multiRateWindows = map[string]MultiRateWindow{
},
}

func multiBurnRate(opts MultiRateOpts) string {
func multiBurnRate(opts MultiRateErrorOpts) string {
multiRateWindow := multiRateWindows[opts.Kind]
conditions := []string{"", ""}

for index, window := range multiRateWindow {
condition := fmt.Sprintf(`(%s:ratio_rate_%s%s > (%g * %.3g) and `, opts.Metric, window.LongWindow, opts.Labels.String(), window.Multiplier, opts.Value)
condition += fmt.Sprintf(`%s:ratio_rate_%s%s > (%g * %.3g))`, opts.Metric, window.ShortWindow, opts.Labels.String(), window.Multiplier, opts.Value)

conditions[index] = condition
}

return strings.Join(conditions, " or ")
}
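
For a service with a 99.9% availability target, the page-severity expression that multiBurnRate renders would look roughly like the sketch below (not part of this commit). The 1h/5m and 6h/30m windows and the 14.4 and 6 multipliers are not visible in this diff and are inferred from the latency test expectations further down; line breaks are added here for readability.

(slo:service_errors_total:ratio_rate_1h{service="my-team.my-service.payment"} > (14.4 * 0.001)
 and slo:service_errors_total:ratio_rate_5m{service="my-team.my-service.payment"} > (14.4 * 0.001))
or
(slo:service_errors_total:ratio_rate_6h{service="my-team.my-service.payment"} > (6 * 0.001)
 and slo:service_errors_total:ratio_rate_30m{service="my-team.my-service.payment"} > (6 * 0.001))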

func multiBurnRateLatency(opts MultiRateLatencyOpts) string {
multiRateWindow := multiRateWindows[opts.Kind]
conditions := []string{}

result := ""
result += fmt.Sprintf(`(%s:ratio_rate_%s%s %s (%g * %.3g) and `, opts.Metric, multiRateWindow[0].LongWindow, opts.Labels.String(), opts.Operator, multiRateWindow[0].Multiplier, opts.Value)
result += fmt.Sprintf(`%s:ratio_rate_%s%s %s (%g * %.3g))`, opts.Metric, multiRateWindow[0].ShortWindow, opts.Labels.String(), opts.Operator, multiRateWindow[0].Multiplier, opts.Value)
for _, bucket := range opts.Buckets {
for _, window := range multiRateWindow {
value := (1 - ((100 - bucket.Target) * window.Multiplier * 0.01))

result += " or "
lbs := labels.New(opts.Label, labels.Label{"le", bucket.LE})

result += fmt.Sprintf(`(%s:ratio_rate_%s%s %s (%g * %.3g) and `, opts.Metric, multiRateWindow[1].LongWindow, opts.Labels.String(), opts.Operator, multiRateWindow[1].Multiplier, opts.Value)
result += fmt.Sprintf(`%s:ratio_rate_%s%s %s (%g * %.3g))`, opts.Metric, multiRateWindow[1].ShortWindow, opts.Labels.String(), opts.Operator, multiRateWindow[1].Multiplier, opts.Value)
condition := fmt.Sprintf(`(%s:ratio_rate_%s%s < %.3g and `, opts.Metric, window.LongWindow, lbs.String(), value)
condition += fmt.Sprintf(`%s:ratio_rate_%s%s < %.3g)`, opts.Metric, window.ShortWindow, lbs.String(), value)

conditions = append(conditions, condition)
}
}

return result
return strings.Join(conditions, " or ")
}
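
Note the comparison direction flips for latency: the recorded ratio is the share of requests finishing within the bucket's le bound, so the alert fires when that share drops below 1 - ((100 - target) * multiplier * 0.01). As a worked example, take a bucket with a 95% target and the page multipliers inferred from the test expectations below (14.4 for the 1h/5m pair, 6 for the 6h/30m pair): the long pair's threshold is 1 - (5 * 14.4 * 0.01) = 1 - 0.72 = 0.28 and the short pair's is 1 - (5 * 6 * 0.01) = 0.70, matching the < 0.28 and < 0.7 comparisons asserted in slo/slo_test.go.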

var _ = register(&MultiWindowAlgorithm{}, "multi-window")
4 changes: 4 additions & 0 deletions main.go
@@ -45,6 +45,10 @@ func main() {

for _, slo := range spec.SLOS {
ruleGroups.Groups = append(ruleGroups.Groups, slo.GenerateGroupRules()...)
ruleGroups.Groups = append(ruleGroups.Groups, rulefmt.RuleGroup{
Name: "slo:" + slo.Name + ":alert",
Rules: slo.GenerateAlertRules(),
})
}

targetFile, err := os.Create(ruleOutput)
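
The loop above now emits one extra group per SLO, named "slo:" + slo.Name + ":alert", alongside the recording-rule groups returned by GenerateGroupRules. Assuming the standard Prometheus rule-file layout that rulefmt marshals to, the generated section for one SLO would look roughly like this sketch (expressions abbreviated; annotations come from the SLO spec):

groups:
- name: slo:my-team.my-service.payment:alert
  rules:
  - alert: slo:my-team.my-service.payment.errors.page
    expr: (slo:service_errors_total:ratio_rate_1h{service="my-team.my-service.payment"} > (14.4 * 0.001) and ...) or (...)
    labels:
      severity: page
  - alert: slo:my-team.my-service.payment.errors.ticket
    ...
  - alert: slo:my-team.my-service.payment.latency.page
    ...
  - alert: slo:my-team.my-service.payment.latency.ticket
    ...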
54 changes: 8 additions & 46 deletions slo/slo.go
@@ -24,17 +24,12 @@ func (block *ExprBlock) ComputeExpr(window, le string) string {
}

type SLO struct {
Name string `yaml:"name"`
AvailabilityObjectivePercent float64 `yaml:"availabilityObjectivePercent"`
LatencyObjectiveBuckets []LatencyBucket `yaml:"latencyObjectiveBuckets"`
ErrorRateRecord ExprBlock `yaml:"errorRateRecord"`
LatencyRecord ExprBlock `yaml:"latencyRecord"`
Annotations map[string]string `yaml:"annotations"`
}

type LatencyBucket struct {
LE string `yaml:"le"`
Target float64 `yaml:"target"`
Name string `yaml:"name"`
AvailabilityObjectivePercent float64 `yaml:"availabilityObjectivePercent"`
LatencyObjectiveBuckets []algorithms.LatencyTarget `yaml:"latencyObjectiveBuckets"`
ErrorRateRecord ExprBlock `yaml:"errorRateRecord"`
LatencyRecord ExprBlock `yaml:"latencyRecord"`
Annotations map[string]string `yaml:"annotations"`
}

func (slo SLO) GenerateAlertRules() []rulefmt.Rule {
@@ -48,42 +43,9 @@ func (slo SLO) GenerateAlertRules() []rulefmt.Rule {

latencyAlgorithm := algorithms.Get(slo.LatencyRecord.AlertAlgorithm)
if latencyAlgorithm != nil {

latencyRules := errorAlgorithm.AlertForLatency(slo.Name, slo.LatencyObjectiveBuckets, slo.Annotations)
alertRules = append(alertRules, latencyRules...)
}
// if slo.Algorithm == "multiwindow" {
// // alerting page
// sloPageRecord := rulefmt.Rule{
// Alert: "slo:" + slo.Name + ".errors.page",
// Expr: algorithms.MultiBurnRateForPage(
// "slo:service_errors_total",
// labels.New(labels.Label{"service", slo.Name}),
// ">", (1 - slo.AvailabilityObjectivePercent/100),
// ),
// Annotations: slo.Annotations,
// Labels: map[string]string{
// "severity": "page",
// },
// }

// alertRules = append(alertRules, sloPageRecord)

// // alerting ticket
// sloTicketRecord := rulefmt.Rule{
// Alert: "slo:" + slo.Name + ".errors.ticket",
// Expr: algorithms.MultiBurnRateForTicket(
// "slo:service_errors_total",
// labels.New(labels.Label{"service", slo.Name}),
// ">", (1 - slo.AvailabilityObjectivePercent/100),
// ),
// Annotations: slo.Annotations,
// Labels: map[string]string{
// "severity": "ticket",
// },
// }

// alertRules = append(alertRules, sloTicketRecord)

// }

return alertRules
}
61 changes: 57 additions & 4 deletions slo/slo_test.go
@@ -4,6 +4,7 @@ import (
"testing"
"time"

algorithms "github.com/globocom/slo-generator/algorithms"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/pkg/rulefmt"
"github.com/stretchr/testify/assert"
@@ -13,7 +14,7 @@ func TestSLOGenerateGroupRules(t *testing.T) {
slo := &SLO{
Name: "my-team.my-service.payment",
AvailabilityObjectivePercent: 99.9,
LatencyObjectiveBuckets: []LatencyBucket{
LatencyObjectiveBuckets: []algorithms.LatencyTarget{
{
LE: "0.1",
Target: 90,
@@ -239,10 +240,10 @@ func TestSLOGenerateAlertRules(t *testing.T) {
slo := &SLO{
Name: "my-team.my-service.payment",
AvailabilityObjectivePercent: 99.9,
LatencyObjectiveBuckets: []LatencyBucket{
LatencyObjectiveBuckets: []algorithms.LatencyTarget{
{
LE: "0.1",
Target: 90,
Target: 95,
},
{
LE: "0.5",
@@ -265,7 +266,7 @@ }
}

alertRules := slo.GenerateAlertRules()
assert.Len(t, alertRules, 2)
assert.Len(t, alertRules, 4)

assert.Equal(t, alertRules[0], rulefmt.Rule{
Alert: "slo:my-team.my-service.payment.errors.page",
@@ -284,4 +285,56 @@ })
},
Annotations: slo.Annotations,
})

assert.Equal(t, alertRules[2], rulefmt.Rule{
Alert: "slo:my-team.my-service.payment.latency.page",
Expr: ("(" +
"slo:service_latency:ratio_rate_1h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.28" +
" and " +
"slo:service_latency:ratio_rate_5m{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.28" +
") or (" +
"slo:service_latency:ratio_rate_6h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.7" +
" and " +
"slo:service_latency:ratio_rate_30m{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.7" +
") or (" +
"slo:service_latency:ratio_rate_1h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.856" +
" and " +
"slo:service_latency:ratio_rate_5m{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.856" +
") or (" +
"slo:service_latency:ratio_rate_6h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.94" +
" and " +
"slo:service_latency:ratio_rate_30m{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.94" +
")"),

Labels: map[string]string{
"severity": "page",
},
Annotations: slo.Annotations,
})

assert.Equal(t, alertRules[3], rulefmt.Rule{
Alert: "slo:my-team.my-service.payment.latency.ticket",
Expr: ("(" +
"slo:service_latency:ratio_rate_1d{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.85" +
" and " +
"slo:service_latency:ratio_rate_2h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.85" +
") or (" +
"slo:service_latency:ratio_rate_3d{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.95" +
" and " +
"slo:service_latency:ratio_rate_6h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.95" +
") or (" +
"slo:service_latency:ratio_rate_1d{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.97" +
" and " +
"slo:service_latency:ratio_rate_2h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.97" +
") or (" +
"slo:service_latency:ratio_rate_3d{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.99" +
" and " +
"slo:service_latency:ratio_rate_6h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.99" +
")"),

Labels: map[string]string{
"severity": "ticket",
},
Annotations: slo.Annotations,
})
}
