Commit
Try to resolve missing series from recording rules
prymitive committed Mar 17, 2022
1 parent 9561783 commit f1f4cdd
Showing 4 changed files with 351 additions and 42 deletions.
46 changes: 45 additions & 1 deletion docs/checks/promql/series.md
@@ -6,7 +6,7 @@ grand_parent: Documentation

# promql/series

-This check will also query Prometheus servers, it is used to warn about queries
+This check will query Prometheus servers, it is used to warn about queries
that are using metrics not currently present in Prometheus.
It parses `expr` query from every rule, finds individual metric selectors and
runs a series of checks for each of them.
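
For example, in the hypothetical rule below (the metric names are made up) pint
would find two selectors, `foo_errors_total{job="foo"}` and
`foo_requests_total{job="foo"}`, and run this check for each of them:

```yaml
- alert: Foo Error Rate Too High
  expr: rate(foo_errors_total{job="foo"}[5m]) / rate(foo_requests_total{job="foo"}[5m]) > 0.1
```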
@@ -26,6 +26,50 @@ to determine why, by checking if:
If you see this check complaining about some metric it might be due to a number
of different issues. Here are some usual cases.

## Your query is using ALERTS or ALERTS_FOR_STATE metrics

Prometheus itself exposes metrics about active alerts, [see docs here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/#inspecting-alerts-during-runtime).
It's possible to use those metrics in recording or alerting rules.
If pint finds a query using either the `ALERTS{alertname="..."}` or
`ALERTS_FOR_STATE{alertname="..."}` selector it will check if there's an
alerting rule with a matching name defined. For queries that don't pass any
`alertname` label filter it will skip any further checks.
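
A minimal sketch of a recording rule that queries `ALERTS` and would pass this
check, assuming the `Too Many Targets Are Down` alerting rule from the example
further down exists (the recording rule name here is only illustrative):

```yaml
# count how many instances of this alert are currently firing
- record: alerts:too_many_targets_are_down:firing
  expr: count(ALERTS{alertname="Too Many Targets Are Down", alertstate="firing"})
```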

## Your query is using recording rules

If a metric isn't present in Prometheus but pint finds a recording rule
with a matching name, then it will try to use that recording rule instead.

This should help with CI checks where multiple rules are added at once
and one depends on the other.

Example with alert rule that depends on two recording rules:

```yaml
# count the number of targets per job
- record: job:up:count
expr: count(up) by(job)

# total number of targets that are healthy per job
- record: job:up:sum
expr: sum(up) by(job)

# alert if less than 50% of targets are up
- alert: Too Many Targets Are Down
expr: (job:up:sum / job:up:count) < 0.5
```

If all three rules were added in a single PR and pint didn't try to match
metrics to recording rules, then `pint ci` would block such a PR because the
metrics this alert is using are not present in Prometheus.
By trying to match metrics to recording rules pint can use those rules
as a substitute for missing metrics and better validate such a PR.

**NOTE**: Checking recording rules instead of real metrics present in Prometheus
can be less accurate and might not spot some issues, like missing labels.
For the most accurate validation via `pint ci` it's best to first add recording
rules before adding alerting rules that depend on them.

### Your query cannot return anything

- You are trying to use a metric that is not present in Prometheus at all.
13 changes: 11 additions & 2 deletions internal/checks/base_test.go
@@ -74,6 +74,7 @@ type checkTest struct {
description string
content string
checker newCheckFn
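// entries is the full list of rule entries passed to every Check() call in this test case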
entries []discovery.Entry
problems problemsFn
mocks []*prometheusMock
}
@@ -101,7 +102,7 @@ func runTests(t *testing.T, testCases []checkTest) {
entries, err := parseContent(tc.content)
require.NoError(t, err, "cannot parse rule content")
for _, entry := range entries {
-problems := tc.checker(uri).Check(ctx, entry.Rule, entries)
+problems := tc.checker(uri).Check(ctx, entry.Rule, tc.entries)
require.Equal(t, tc.problems(uri), problems)
}

@@ -124,7 +125,7 @@ func runTests(t *testing.T, testCases []checkTest) {
require.NoError(t, err, "cannot parse rule content")
t.Run(tc.description+" (bogus rules)", func(t *testing.T) {
for _, entry := range entries {
_ = tc.checker("").Check(ctx, entry.Rule, entries)
_ = tc.checker("").Check(ctx, entry.Rule, tc.entries)
}
})
}
@@ -148,6 +149,14 @@ func parseContent(content string) (entries []discovery.Entry, err error) {
return entries, nil
}

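// mustParseContent works like parseContent but panics on any error,
// so it can be used inline when defining test cases.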
func mustParseContent(content string) (entries []discovery.Entry) {
entries, err := parseContent(content)
if err != nil {
panic(err)
}
return entries
}

func noProblems(uri string) []checks.Problem {
return nil
}
94 changes: 94 additions & 0 deletions internal/checks/promql_series.go
@@ -3,6 +3,7 @@ package checks
import (
"context"
"fmt"
"strings"
"time"

"github.com/rs/zerolog/log"
@@ -73,6 +74,48 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
}
}

// 0. Special case for alert metrics
if metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE" {
var alertname string
for _, lm := range selector.LabelMatchers {
if lm.Name == "alertname" && lm.Type != labels.MatchRegexp && lm.Type != labels.MatchNotRegexp {
alertname = lm.Value
}
}
var arEntry *discovery.Entry
if alertname != "" {
for _, entry := range entries {
entry := entry
if entry.Rule.AlertingRule != nil &&
entry.Rule.Error.Err == nil &&
entry.Rule.AlertingRule.Alert.Value.Value == alertname {
arEntry = &entry
break
}
}
if arEntry != nil {
log.Debug().Stringer("selector", &selector).Str("path", arEntry.Path).Msg("Metric is provided by alerting rule")
problems = append(problems, Problem{
Fragment: selector.String(),
Lines: expr.Lines(),
Reporter: c.Reporter(),
Text: fmt.Sprintf("%s metric is generated by alerts and found alerting rule named %q", selector.String(), alertname),
Severity: Information,
})
} else {
problems = append(problems, Problem{
Fragment: selector.String(),
Lines: expr.Lines(),
Reporter: c.Reporter(),
Text: fmt.Sprintf("%s metric is generated by alerts but didn't found any rule named %q", selector.String(), alertname),
Severity: Bug,
})
}
}
// ALERTS{} query with no alertname, all good
continue
}

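// collect all label names used by this selector, except the metric name itself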
labelNames := []string{}
for _, lm := range selector.LabelMatchers {
if lm.Name != labels.MetricName {
@@ -100,6 +143,45 @@ func (c SeriesCheck) Check(ctx context.Context, rule parser.Rule, entries []disc
continue
}
if len(trs.ranges) == 0 {
// Check if we have a recording rule that provides this metric before we give up
var rrEntry *discovery.Entry
for _, entry := range entries {
entry := entry
if entry.Rule.RecordingRule != nil &&
entry.Rule.Error.Err == nil &&
entry.Rule.RecordingRule.Record.Value.Value == bareSelector.String() {
rrEntry = &entry
break
}
}
if rrEntry != nil {
// Validate recording rule instead
log.Debug().Stringer("selector", &bareSelector).Str("path", rrEntry.Path).Msg("Metric is provided by recording rule")
problems = append(problems, Problem{
Fragment: bareSelector.String(),
Lines: expr.Lines(),
Reporter: c.Reporter(),
Text: fmt.Sprintf("%s didn't have any series for %q metric in the last %s but found recording rule that generates it, "+
"pint will try to use source recording rule queries to validate selectors in this query but it might be less accurate",
promText(c.prom.Name(), trs.uri), bareSelector.String(), trs.sinceDesc(trs.from)),
Severity: Information,
})

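// run the recording rule source query restricted to this selector's
// equality matchers to see if it currently returns any series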
q := wrapLabelReplaces(rrEntry.Rule.RecordingRule.Expr.Value.Value, selector.LabelMatchers)
count, _, err := c.instantSeriesCount(ctx, q)
if err != nil {
problems = append(problems, c.queryProblem(err, selector.String(), expr))
continue
}
if count > 0 {
log.Debug().Str("check", c.Reporter()).Stringer("selector", &selector).Msg("Found fallback series from recording rule, skipping further checks")
continue
}

// if it's not there, recurse the check onto the recording rule itself
problems = append(problems, c.Check(ctx, rrEntry.Rule, entries)...)
}

problems = append(problems, Problem{
Fragment: bareSelector.String(),
Lines: expr.Lines(),
@@ -435,3 +517,15 @@ func (tr timeRanges) sinceDesc(t time.Time) (s string) {
}
return output.HumanizeDuration(dur.Round(time.Minute))
}

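// wrapLabelReplaces wraps query q so it can be counted while requiring every
// equality label matcher from lms to match, for example for q=`sum(up) by(job)`
// and a {job="node"} matcher it returns:
// count(sum(up) by(job) AND on(job) label_replace(vector(1), "job", "node", "", ""))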
func wrapLabelReplaces(q string, lms []*labels.Matcher) string {
out := "vector(1)"
onLabels := []string{}
for _, lm := range lms {
if lm.Name != labels.MetricName && lm.Type == labels.MatchEqual {
out = fmt.Sprintf(`label_replace(%s, "%s", "%s", "", "")`, out, lm.Name, lm.Value)
onLabels = append(onLabels, lm.Name)
}
}
return fmt.Sprintf(`count(%s AND on(%s) %s)`, q, strings.Join(onLabels, ","), out)
}
