diff --git a/main.go b/main.go
index 9c66d99..b98dc40 100644
--- a/main.go
+++ b/main.go
@@ -12,13 +12,15 @@ import (
 
 func main() {
 	var (
-		sloPath     = ""
-		classesPath = ""
-		ruleOutput  = ""
+		sloPath       = ""
+		classesPath   = ""
+		ruleOutput    = ""
+		disableTicket = false
 	)
 	flag.StringVar(&sloPath, "slo.path", "", "A YML file describing SLOs")
 	flag.StringVar(&classesPath, "classes.path", "", "A YML file describing SLOs classes (optional)")
 	flag.StringVar(&ruleOutput, "rule.output", "", "Output to describe a prometheus rules")
+	flag.BoolVar(&disableTicket, "disable.ticket", false, "Disable generation of alerts of kind ticket")
 
 	flag.Parse()
 
@@ -57,10 +59,10 @@ func main() {
 			log.Fatalf("Could not compile SLO: %q, err: %q", slo.Name, err.Error())
 		}
 
-		ruleGroups.Groups = append(ruleGroups.Groups, slo.GenerateGroupRules(sloClass)...)
+		ruleGroups.Groups = append(ruleGroups.Groups, slo.GenerateGroupRules(sloClass, disableTicket)...)
 		ruleGroups.Groups = append(ruleGroups.Groups, rulefmt.RuleGroup{
 			Name:  "slo:" + slo.Name + ":alert",
-			Rules: slo.GenerateAlertRules(sloClass),
+			Rules: slo.GenerateAlertRules(sloClass, disableTicket),
 		})
 	}
 
diff --git a/slo/samples.go b/slo/samples.go
index f942f46..4274847 100644
--- a/slo/samples.go
+++ b/slo/samples.go
@@ -23,3 +23,19 @@ var defaultSamples = []sample{
 		Buckets:  []string{"1d", "3d"},
 	},
 }
+
+// disabledBucketsForTickets lists the rate windows whose recording rules are
+// skipped when ticket alerts are disabled.
+var disabledBucketsForTickets = []string{"3d", "1d", "2h"}
+
+// isTicketSample reports whether sample is one of the buckets listed in
+// disabledBucketsForTickets.
+func isTicketSample(sample string) bool {
+	for _, bucket := range disabledBucketsForTickets {
+		if bucket == sample {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/slo/slo.go b/slo/slo.go
index d1a12c9..cd4e7b1 100644
--- a/slo/slo.go
+++ b/slo/slo.go
@@ -80,7 +80,9 @@ func (o *Objectives) LatencyBuckets() []string {
 	return latencyBuckets
 }
 
-func (slo *SLO) GenerateAlertRules(sloClass *Class) []rulefmt.Rule {
+// GenerateAlertRules builds the alert rules for the SLO. When disableTicket
+// is true, rules labeled severity "ticket" are left out of the result.
+func (slo *SLO) GenerateAlertRules(sloClass *Class, disableTicket bool) []rulefmt.Rule {
 	objectives := slo.Objectives
 	if sloClass != nil {
 		objectives = sloClass.Objectives
@@ -104,6 +106,18 @@ func (slo *SLO) GenerateAlertRules(sloClass *Class) []rulefmt.Rule {
 		slo.fillMetadata(&rule)
 	}
 
+	if disableTicket {
+		alertRulesWithoutTicket := make([]rulefmt.Rule, 0, len(alertRules))
+
+		for _, rule := range alertRules {
+			if rule.Labels["severity"] != "ticket" {
+				alertRulesWithoutTicket = append(alertRulesWithoutTicket, rule)
+			}
+		}
+
+		return alertRulesWithoutTicket
+	}
+
 	return alertRules
 }
 
@@ -117,7 +131,9 @@ func (slo *SLO) fillMetadata(rule *rulefmt.Rule) {
 	}
 }
 
-func (slo *SLO) GenerateGroupRules(sloClass *Class) []rulefmt.RuleGroup {
+// GenerateGroupRules builds the recording rule groups for the SLO. When
+// disableTicket is true, buckets reported by isTicketSample are skipped.
+func (slo *SLO) GenerateGroupRules(sloClass *Class, disableTicket bool) []rulefmt.RuleGroup {
 	rules := []rulefmt.RuleGroup{}
 
 	objectives := slo.Objectives
@@ -141,6 +157,10 @@ func (slo *SLO) GenerateGroupRules(sloClass *Class) []rulefmt.RuleGroup {
 		}
 
 		for _, bucket := range sample.Buckets {
+			if disableTicket && isTicketSample(bucket) {
+				continue
+			}
+
 			ruleGroup.Rules = append(ruleGroup.Rules, slo.generateRules(bucket, latencyBuckets)...)
 		}
 
diff --git a/slo/slo_test.go b/slo/slo_test.go
index d48a8f5..71c45f8 100644
--- a/slo/slo_test.go
+++ b/slo/slo_test.go
@@ -47,7 +47,7 @@ func TestSLOGenerateGroupRules(t *testing.T) {
 		},
 	}
 
-	groupRules := slo.GenerateGroupRules(nil)
+	groupRules := slo.GenerateGroupRules(nil, false)
 	assert.Len(t, groupRules, 3)
 
 	assert.Equal(t, groupRules[0], rulefmt.RuleGroup{
@@ -330,7 +330,7 @@ func TestSLOGenerateGroupRulesWithLatencyQuantile(t *testing.T) {
 		},
 	}
 
-	groupRules := slo.GenerateGroupRules(nil)
+	groupRules := slo.GenerateGroupRules(nil, false)
 	assert.Len(t, groupRules, 3)
 
 	assert.Equal(t, rulefmt.RuleGroup{
@@ -488,7 +488,7 @@ func TestSLOGenerateGroupRulesWithAutoDiscovery(t *testing.T) {
 		},
 	}
 
-	groupRules := slo.GenerateGroupRules(nil)
+	groupRules := slo.GenerateGroupRules(nil, false)
 	assert.Len(t, groupRules, 3)
 
 	assert.Equal(t, rulefmt.RuleGroup{
@@ -697,7 +697,7 @@ func TestSLOGenerateAlertRules(t *testing.T) {
 		},
 	}
 
-	alertRules := slo.GenerateAlertRules(nil)
+	alertRules := slo.GenerateAlertRules(nil, false)
 	assert.Len(t, alertRules, 4)
 
 	assert.Equal(t, alertRules[0], rulefmt.Rule{
@@ -807,7 +807,7 @@ func TestSLOGenerateAlertRulesWithoutExpressions(t *testing.T) {
 		},
 	}
 
-	alertRules := slo.GenerateAlertRules(nil)
+	alertRules := slo.GenerateAlertRules(nil, false)
 	assert.Len(t, alertRules, 4)
 
 	assert.Equal(t, alertRules[0], rulefmt.Rule{
@@ -927,7 +927,7 @@ func TestSLOGenerateAlertRulesWithSLOCLass(t *testing.T) {
 		},
 	}
 
-	alertRules := slo.GenerateAlertRules(sloClass)
+	alertRules := slo.GenerateAlertRules(sloClass, false)
 	assert.Len(t, alertRules, 4)
 
 	assert.Equal(t, alertRules[0], rulefmt.Rule{
@@ -1004,6 +1004,276 @@ func TestSLOGenerateAlertRulesWithSLOCLass(t *testing.T) {
 		Annotations: slo.Annotations,
 	})
 
-	alertRules = slo.GenerateAlertRules(noLatencyClass)
+	alertRules = slo.GenerateAlertRules(noLatencyClass, false)
 	assert.Len(t, alertRules, 2)
 }
+
+func TestSLOGenerateAlertRulesWithoutTickets(t *testing.T) {
+	slo := &SLO{
+		Name: "my-team.my-service.payment",
+		Objectives: Objectives{
+			Availability: 99.9,
+			Latency: []methods.LatencyTarget{
+				{
+					LE:     "0.1",
+					Target: 95,
+				},
+				{
+					LE:     "0.5",
+					Target: 99,
+				},
+			},
+		},
+		ErrorRateRecord: ExprBlock{
+			AlertMethod: "multi-window",
+		},
+		LatencyRecord: ExprBlock{
+			AlertMethod: "multi-window",
+		},
+		Labels: map[string]string{
+			"channel": "my-channel",
+		},
+		Annotations: map[string]string{
+			"message":   "Service A has lower SLI",
+			"link":      "http://wiki.ops/1234",
+			"dashboard": "http://grafana.globo.com",
+		},
+	}
+
+	alertRules := slo.GenerateAlertRules(nil, true)
+	assert.Len(t, alertRules, 2)
+
+	assert.Equal(t, rulefmt.Rule{
+		Alert: "slo:my-team.my-service.payment.errors.page",
+		Expr:  "(slo:service_errors_total:ratio_rate_1h{service=\"my-team.my-service.payment\"} > (14.4 * 0.001) and slo:service_errors_total:ratio_rate_5m{service=\"my-team.my-service.payment\"} > (14.4 * 0.001)) or (slo:service_errors_total:ratio_rate_6h{service=\"my-team.my-service.payment\"} > (6 * 0.001) and slo:service_errors_total:ratio_rate_30m{service=\"my-team.my-service.payment\"} > (6 * 0.001))",
+		Labels: map[string]string{
+			"channel":  "my-channel",
+			"severity": "page",
+		},
+		Annotations: slo.Annotations,
+	}, alertRules[0])
+
+	assert.Equal(t, rulefmt.Rule{
+		Alert: "slo:my-team.my-service.payment.latency.page",
+		Expr: ("(" +
+			"slo:service_latency:ratio_rate_1h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.28" +
+			" and " +
+			"slo:service_latency:ratio_rate_5m{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.28" +
+			") or (" +
+			"slo:service_latency:ratio_rate_6h{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.7" +
+			" and " +
+			"slo:service_latency:ratio_rate_30m{le=\"0.1\", service=\"my-team.my-service.payment\"} < 0.7" +
+			") or (" +
+			"slo:service_latency:ratio_rate_1h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.856" +
+			" and " +
+			"slo:service_latency:ratio_rate_5m{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.856" +
+			") or (" +
+			"slo:service_latency:ratio_rate_6h{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.94" +
+			" and " +
+			"slo:service_latency:ratio_rate_30m{le=\"0.5\", service=\"my-team.my-service.payment\"} < 0.94" +
+			")"),
+
+		Labels: map[string]string{
+			"channel":  "my-channel",
+			"severity": "page",
+		},
+		Annotations: slo.Annotations,
+	}, alertRules[1])
+}
+
+func TestSLOGenerateGroupRulesWithoutTickets(t *testing.T) {
+	slo := &SLO{
+		Name: "my-team.my-service.payment",
+		Objectives: Objectives{
+			Availability: 99.9,
+			Latency: []methods.LatencyTarget{
+				{
+					LE:     "0.1",
+					Target: 90,
+				},
+				{
+					LE:     "0.5",
+					Target: 99,
+				},
+			},
+		},
+		TrafficRateRecord: ExprBlock{
+			Expr: "sum(rate(http_total[$window]))",
+		},
+		ErrorRateRecord: ExprBlock{
+			AlertMethod: "multi-window",
+			Expr:        "sum(rate(http_errors[$window]))/sum(rate(http_total[$window]))",
+		},
+		LatencyRecord: ExprBlock{
+			AlertMethod: "multi-window",
+			Expr:        "sum(rate(http_bucket{le=\"$le\"}[$window]))/sum(rate(http_total[$window]))",
+		},
+		Labels: map[string]string{
+			"team": "team-avengers",
+		},
+		Annotations: map[string]string{
+			"message":   "Service A has lower SLI",
+			"link":      "http://wiki.ops/1234",
+			"dashboard": "http://grafana.globo.com",
+		},
+	}
+
+	groupRules := slo.GenerateGroupRules(nil, true)
+	assert.Len(t, groupRules, 2)
+
+	assert.Equal(t, rulefmt.RuleGroup{
+		Name:     "slo:my-team.my-service.payment:short",
+		Interval: model.Duration(time.Second * 30),
+		Rules: []rulefmt.Rule{
+			// 5m
+			{
+				Record: "slo:service_traffic:ratio_rate_5m",
+				Expr:   "sum(rate(http_total[5m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_errors_total:ratio_rate_5m",
+				Expr:   "sum(rate(http_errors[5m]))/sum(rate(http_total[5m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_5m",
+				Expr:   "sum(rate(http_bucket{le=\"0.1\"}[5m]))/sum(rate(http_total[5m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+					"le":      "0.1",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_5m",
+				Expr:   "sum(rate(http_bucket{le=\"0.5\"}[5m]))/sum(rate(http_total[5m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+					"le":      "0.5",
+				},
+			},
+
+			// 30m
+			{
+				Record: "slo:service_traffic:ratio_rate_30m",
+				Expr:   "sum(rate(http_total[30m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_errors_total:ratio_rate_30m",
+				Expr:   "sum(rate(http_errors[30m]))/sum(rate(http_total[30m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_30m",
+				Expr:   "sum(rate(http_bucket{le=\"0.1\"}[30m]))/sum(rate(http_total[30m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"le":      "0.1",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_30m",
+				Expr:   "sum(rate(http_bucket{le=\"0.5\"}[30m]))/sum(rate(http_total[30m]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"le":      "0.5",
+					"team":    "team-avengers",
+				},
+			},
+
+			// 1h
+			{
+				Record: "slo:service_traffic:ratio_rate_1h",
+				Expr:   "sum(rate(http_total[1h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_errors_total:ratio_rate_1h",
+				Expr:   "sum(rate(http_errors[1h]))/sum(rate(http_total[1h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_1h",
+				Expr:   "sum(rate(http_bucket{le=\"0.1\"}[1h]))/sum(rate(http_total[1h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+					"le":      "0.1",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_1h",
+				Expr:   "sum(rate(http_bucket{le=\"0.5\"}[1h]))/sum(rate(http_total[1h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"le":      "0.5",
+					"team":    "team-avengers",
+				},
+			},
+		},
+	}, groupRules[0])
+
+	assert.Equal(t, rulefmt.RuleGroup{
+		Name:     "slo:my-team.my-service.payment:medium",
+		Interval: model.Duration(time.Second * 120),
+		Rules: []rulefmt.Rule{
+			// 6h
+			{
+				Record: "slo:service_traffic:ratio_rate_6h",
+				Expr:   "sum(rate(http_total[6h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_errors_total:ratio_rate_6h",
+				Expr:   "sum(rate(http_errors[6h]))/sum(rate(http_total[6h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_6h",
+				Expr:   "sum(rate(http_bucket{le=\"0.1\"}[6h]))/sum(rate(http_total[6h]))",
+				Labels: map[string]string{
+					"service": "my-team.my-service.payment",
+					"le":      "0.1",
+					"team":    "team-avengers",
+				},
+			},
+			{
+				Record: "slo:service_latency:ratio_rate_6h",
+				Expr:   "sum(rate(http_bucket{le=\"0.5\"}[6h]))/sum(rate(http_total[6h]))",
+				Labels: map[string]string{
+					"team":    "team-avengers",
+					"service": "my-team.my-service.payment",
+					"le":      "0.5",
+				},
+			},
+		},
+	}, groupRules[1])
+}