Initial commit

globocom · Aug 5, 2019 · a3f0f49 · a3f0f49
1 parent 5373e9e
commit a3f0f49
Show file tree

Hide file tree

Showing 628 changed files with 299,203 additions and 0 deletions.
diff --git a/algorithms/algorithms.go b/algorithms/algorithms.go
@@ -0,0 +1,33 @@
+package algoritms
+
+import (
+	"fmt"
+
+	"github.com/prometheus/prometheus/pkg/labels"
+)
+
+// MultiBurnRateForPage builds the PromQL expression for a paging alert that
+// combines two pairs of rate windows:
+//
+//	(<metric>:ratio_rate_1h <op> 14.4*value and <metric>:ratio_rate_5m  <op> 14.4*value)
+//	or
+//	(<metric>:ratio_rate_6h <op> 6*value    and <metric>:ratio_rate_30m <op> 6*value)
+//
+// metric is the recording-rule prefix placed before ":ratio_rate_<window>",
+// lbs.String() is appended verbatim right after each series name (expected to
+// render as a label selector; confirm against the labels package), operator is
+// the comparison operator written into the expression, and value is the
+// threshold that is multiplied by the burn-rate factors.
+func MultiBurnRateForPage(metric string, lbs labels.Labels, operator string, value float64) string {
+	selector := lbs.String()
+
+	// %.3g keeps three significant digits. The previous %.3f kept three
+	// decimals, so small thresholds such as 0.0001 (a 99.99% objective) were
+	// rendered as 0.000 and the comparison degenerated to "<op> 0".
+	result := ""
+	result += fmt.Sprintf(`(%s:ratio_rate_1h%s %s (14.4 * %.3g) and `, metric, selector, operator, value)
+	result += fmt.Sprintf(`%s:ratio_rate_5m%s %s (14.4 * %.3g))`, metric, selector, operator, value)
+
+	result += " or "
+
+	result += fmt.Sprintf(`(%s:ratio_rate_6h%s %s (6 * %.3g) and `, metric, selector, operator, value)
+	result += fmt.Sprintf(`%s:ratio_rate_30m%s %s (6 * %.3g))`, metric, selector, operator, value)
+
+	return result
+}
+
+// MultiBurnRateForTicket builds the PromQL expression for a ticket alert that
+// combines two pairs of rate windows:
+//
+//	(<metric>:ratio_rate_1d <op> 3*value and <metric>:ratio_rate_2h <op> 3*value)
+//	or
+//	(<metric>:ratio_rate_3d <op> value   and <metric>:ratio_rate_6h <op> value)
+//
+// metric is the recording-rule prefix placed before ":ratio_rate_<window>",
+// lbs.String() is appended verbatim right after each series name (expected to
+// render as a label selector; confirm against the labels package), operator is
+// the comparison operator written into the expression, and value is the
+// threshold; the second pair compares against value with no multiplier.
+func MultiBurnRateForTicket(metric string, lbs labels.Labels, operator string, value float64) string {
+	selector := lbs.String()
+
+	// %.3g keeps three significant digits. The previous %.3f kept three
+	// decimals, so small thresholds such as 0.0001 (a 99.99% objective) were
+	// rendered as 0.000 and the comparison degenerated to "<op> 0".
+	result := ""
+	result += fmt.Sprintf(`(%s:ratio_rate_1d%s %s (3 * %.3g) and `, metric, selector, operator, value)
+	result += fmt.Sprintf(`%s:ratio_rate_2h%s %s (3 * %.3g))`, metric, selector, operator, value)
+
+	result += " or "
+
+	result += fmt.Sprintf(`(%s:ratio_rate_3d%s %s %.3g and `, metric, selector, operator, value)
+	result += fmt.Sprintf(`%s:ratio_rate_6h%s %s %.3g)`, metric, selector, operator, value)
+
+	return result
+}
diff --git a/go.mod b/go.mod
@@ -0,0 +1,11 @@
+module github.com/globocom/slo-generator
+
+go 1.12
+
+require (
+	github.com/opentracing/opentracing-go v1.1.0 // indirect
+	github.com/prometheus/common v0.6.0
+	github.com/prometheus/prometheus v0.0.0-20190525122359-d20e84d0fb64
+	github.com/prometheus/tsdb v0.10.0 // indirect
+	gopkg.in/yaml.v2 v2.2.2
+)
diff --git a/go.sum b/go.sum
diff --git a/main.go b/main.go
@@ -0,0 +1,176 @@
+package main
+
+import (
+	"flag"
+	"log"
+	"os"
+	"strings"
+
+	algorithms "github.com/globocom/slo-generator/algorithms"
+	"github.com/prometheus/common/model"
+	"github.com/prometheus/prometheus/pkg/labels"
+	"github.com/prometheus/prometheus/pkg/rulefmt"
+	yaml "gopkg.in/yaml.v2"
+)
+
+// Sample describes one family of recording rules that share an evaluation
+// interval.
+type Sample struct {
+	// Name is appended to the rule group name ("slo:<slo>_<Name>").
+	Name     string
+	// Interval is a duration string parsed with model.ParseDuration and used
+	// as the rule group's evaluation interval.
+	Interval string
+	// Buckets lists the rate windows; each one yields a recording rule whose
+	// name ends in "ratio_rate_<bucket>" and whose expression has $window
+	// replaced by the bucket.
+	Buckets  []string
+}
+
+// defaultSamples is the fixed set of recording-rule families generated for
+// every SLO. Between them the buckets cover every window referenced by the
+// alert expressions in the algorithms package (5m, 30m, 1h, 2h, 6h, 1d, 3d).
+var defaultSamples = []Sample{
+	{
+		Name:     "short",
+		Interval: "30s",
+		Buckets:  []string{"5m", "30m", "1h"},
+	},
+	{
+		Name:     "medium",
+		Interval: "2m",
+		Buckets:  []string{"2h", "6h"},
+	},
+	{
+		Name:     "daily",
+		Interval: "5m",
+		Buckets:  []string{"1d", "3d"},
+	},
+}
+
+// SLOSpec is the root document decoded from the SLO definition file.
+type SLOSpec struct {
+	// SLOS holds the individual SLO definitions. The field has no yaml tag;
+	// the example file supplies it under the top-level key "slos".
+	SLOS []SLO
+}
+
+// ExprBlock wraps a query expression template read from the SLO file.
+type ExprBlock struct {
+	// Expr is the expression text; ComputeExpr substitutes the "$window"
+	// placeholder in it.
+	Expr string `yaml:"expr"`
+}
+
+func (block *ExprBlock) ComputeExpr(window string) string {
+	replacer := strings.NewReplacer("$window", window)
+	return replacer.Replace(block.Expr)
+}
+
+// SLO is a single service level objective as declared in the SLO file.
+// Only Name, AvailabilityObjectivePercent, ErrorRateRecord and Annotations
+// are read by GenerateGroupRules; the remaining fields are decoded but not
+// used by the code visible in this file.
+type SLO struct {
+	// Name identifies the service; it is used in rule group names, alert
+	// names and as the value of the "service" label.
+	Name                         string             `yaml:"name"`
+	// Algorithm is the alerting algorithm name given in the file.
+	Algorithm                    string             `yaml:"algorithm"`
+	// AvailabilityObjectivePercent is the availability target in percent
+	// (e.g. 99.9); the alert threshold is derived as 1 - value/100.
+	AvailabilityObjectivePercent float64            `yaml:"availabilityObjectivePercent"`
+	// LatencyObjectiveBuckets maps a numeric key to a string value as written
+	// in the file (the example maps a percentage to a duration).
+	LatencyObjectiveBuckets      map[float64]string `yaml:"latencyObjectiveBuckets"`
+	// ErrorRateRecord is the expression template for the error ratio
+	// recording rules.
+	ErrorRateRecord              ExprBlock          `yaml:"errorRateRecord"`
+	// LatencyRecord is the expression template for latency recording rules.
+	LatencyRecord                ExprBlock          `yaml:"latencyRecord"`
+	// Annotations are copied onto every generated alert rule.
+	Annotations                  map[string]string  `yaml:"annotations"`
+}
+
+// GenerateGroupRules builds the Prometheus rule groups for this SLO.
+//
+// One recording group is produced per entry of defaultSamples, named
+// "slo:<name>_<sample>" and evaluated at the sample's interval. Each group
+// holds one "slo:service_errors_total:ratio_rate_<bucket>" recording rule per
+// bucket; its expression is ErrorRateRecord with $window set to the bucket and
+// it carries a service=<name> label.
+//
+// A final group, "slo:<name>_alert", holds the page and ticket alerts built by
+// the algorithms package. Both alerts fire when the recorded error ratio is
+// greater than the burn-rate multiple of the error budget
+// (1 - AvailabilityObjectivePercent/100), and both reuse slo.Annotations.
+//
+// The process exits through log.Fatal if a sample interval cannot be parsed.
+func (slo SLO) GenerateGroupRules() []rulefmt.RuleGroup {
+	rules := []rulefmt.RuleGroup{}
+
+	for _, sample := range defaultSamples {
+		interval, err := model.ParseDuration(sample.Interval)
+		if err != nil {
+			log.Fatal(err)
+		}
+		ruleGroup := rulefmt.RuleGroup{
+			Name:     "slo:" + slo.Name + "_" + sample.Name,
+			Interval: interval,
+			Rules:    []rulefmt.Rule{},
+		}
+
+		for _, bucket := range sample.Buckets {
+			errorRateRecord := rulefmt.Rule{
+				Record: "slo:service_errors_total:ratio_rate_" + bucket,
+				Expr:   slo.ErrorRateRecord.ComputeExpr(bucket),
+				Labels: map[string]string{
+					"service": slo.Name,
+				},
+			}
+
+			ruleGroup.Rules = append(ruleGroup.Rules, errorRateRecord)
+		}
+
+		rules = append(rules, ruleGroup)
+	}
+
+	// Fraction of requests that may fail while still meeting the objective.
+	errorBudget := 1 - slo.AvailabilityObjectivePercent/100
+
+	// Selector matching the "service" label set on the recording rules above.
+	serviceLabels := labels.New(labels.Label{Name: "service", Value: slo.Name})
+
+	// alerting
+	alertingGroup := rulefmt.RuleGroup{
+		Name:  "slo:" + slo.Name + "_alert",
+		Rules: []rulefmt.Rule{},
+	}
+
+	// alerting page
+	// The recorded series is an error ratio, so the alert must fire when it
+	// rises above the threshold. Comparing with "<" fired on healthy services
+	// and stayed silent while the budget was being burned.
+	sloPageRecord := rulefmt.Rule{
+		Alert: "slo:" + slo.Name + ".errors.page",
+		Expr: algorithms.MultiBurnRateForPage(
+			"slo:service_errors_total",
+			serviceLabels,
+			">", errorBudget,
+		),
+		Annotations: slo.Annotations,
+	}
+
+	alertingGroup.Rules = append(alertingGroup.Rules, sloPageRecord)
+
+	// alerting ticket
+	sloTicketRecord := rulefmt.Rule{
+		Alert: "slo:" + slo.Name + ".errors.ticket",
+		Expr: algorithms.MultiBurnRateForTicket(
+			"slo:service_errors_total",
+			serviceLabels,
+			">", errorBudget,
+		),
+		Annotations: slo.Annotations,
+	}
+
+	alertingGroup.Rules = append(alertingGroup.Rules, sloTicketRecord)
+
+	rules = append(rules, alertingGroup)
+
+	return rules
+}
+
+// main reads the SLO definitions from the file given by -slo.path, generates
+// the rule groups for every SLO and writes them as YAML to the file given by
+// -rule.output. Both flags are required; any failure terminates the process
+// through log.Fatal.
+func main() {
+	var (
+		sloPath    = ""
+		ruleOutput = ""
+	)
+	flag.StringVar(&sloPath, "slo.path", "", "A YML file describing SLOs")
+	flag.StringVar(&ruleOutput, "rule.output", "", "Output to describe a prometheus rules")
+
+	flag.Parse()
+
+	if sloPath == "" {
+		log.Fatal("slo.path is a required param")
+	}
+
+	if ruleOutput == "" {
+		log.Fatal("rule.output is a required param")
+	}
+
+	f, err := os.Open(sloPath)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	spec := &SLOSpec{}
+	err = yaml.NewDecoder(f).Decode(spec)
+	// Release the input as soon as decoding is done. The file was opened
+	// read-only, so a Close error cannot lose data and is deliberately ignored.
+	_ = f.Close()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	ruleGroups := &rulefmt.RuleGroups{
+		Groups: []rulefmt.RuleGroup{},
+	}
+
+	for _, slo := range spec.SLOS {
+		ruleGroups.Groups = append(ruleGroups.Groups, slo.GenerateGroupRules()...)
+	}
+
+	targetFile, err := os.Create(ruleOutput)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Close explicitly instead of deferring: log.Fatal exits without running
+	// deferred calls, and a Close error on a written file can mean the data
+	// never reached disk, so it has to be reported.
+	encodeErr := yaml.NewEncoder(targetFile).Encode(ruleGroups)
+	closeErr := targetFile.Close()
+	if encodeErr != nil {
+		log.Fatal(encodeErr)
+	}
+	if closeErr != nil {
+		log.Fatal(closeErr)
+	}
+	log.Printf("generated a SLO record in %q", ruleOutput)
+}
diff --git a/slo_example.yml b/slo_example.yml
@@ -0,0 +1,45 @@
+slos:
+  - name: myteam-a.service-a
+    algorithm: multiwindow
+    availabilityObjectivePercent: 99
+    latencyObjectiveBuckets:
+      95: 5s   # 95% < 5s
+      97: 10s  # 97% < 10s
+
+    annotations:
+      message: Service A Error Budget consumption
+      link: https://grafana.myservice.com/URL
+      slack_channel: '_team_a'
+
+    errorRateRecord:
+      expr: |
+        sum (rate(http_requests_total{job="service-a", status="5xx"}[$window])) /
+        sum (rate(http_requests_total{job="service-a"}[$window]))
+
+    latencyRecord:
+      expr: |
+        sum (rate(http_request_duration_seconds_bucket{job="service-a", le="{{ $latencyBucket }}"}[$window])) /
+        sum (rate(http_requests_total{job="service-a"}[$window]))
+
+
+  - name: myteam-b.service-b
+    algorithm: multiwindow
+    availabilityObjectivePercent: 99.9
+    latencyObjectiveBuckets:
+      90: 50ms   # 90% < 50ms
+      95: 100ms  # 95% < 100ms
+
+    annotations:
+      message: Service B Error Budget consumption
+      link:
+      slack_channel: '_team_b'
+
+    errorRateRecord:
+      expr: |
+        sum (rate(http_requests_total{job="service-b", status="5xx"}[$window])) /
+        sum (rate(http_requests_total{job="service-b"}[$window]))
+
+    latencyRecord:
+      expr: |
+        sum (rate(http_request_duration_seconds_bucket{job="service-b", le="{{ $latencyBucket }}"}[$window])) /
+        sum (rate(http_requests_total{job="service-b"}[$window]))
diff --git a/vendor/github.com/beorn7/perks/LICENSE b/vendor/github.com/beorn7/perks/LICENSE