Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: mixin generation when cluster label is changed #12613

Merged
merged 5 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions production/loki-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,17 @@
{
alert: 'LokiTooManyCompactorsRunning',
expr: |||
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
|||,
sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1
||| % $._config.per_cluster_label,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Loki deployment is running more than one compactor.',
description: |||
description: std.strReplace(|||
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
|||,
|||, 'cluster', $._config.per_cluster_label),
},
},
],
Expand Down
48 changes: 24 additions & 24 deletions production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ local grafana = import 'grafonnet/grafana.libsonnet';
// This logic is inherited from mimir-mixin.
dashboard.dashboard('Canary')
// We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality.
.addTemplate('cluster', 'loki_build_info', 'cluster')
.addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace')
.addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label)
.addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace')
+ {
// This dashboard uses the new grid system in order to place panels (using gridPos).
// Because of this we can't use the mixin's addRow() and addPanel().
schemaVersion: 27,
rows: null,
// Ugly hack: copy-paste the tag/link
// code from the loki-mixin.
tags: ['loki'],
tags: $._config.tags,
links: [
{
asDropdown: true,
Expand All @@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet';
panels: [
// grid row 1
dashboard.panel('Canary Entries Total') +
dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') +
dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 0 } },

dashboard.panel('Canary Logs Total') +
dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 3, y: 0 } },

dashboard.panel('Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 6, y: 0 } },

dashboard.panel('Spotcheck Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 9, y: 0 } },

// grid row 2
dashboard.panel('Spotcheck Total') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 4 } },

dashboard.panel('Metric Test Error %') +
dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') +
dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') +
{ gridPos: { h: 4, w: 3, x: 3, y: 4 } },

dashboard.panel('Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
{ gridPos: { h: 4, w: 3, x: 6, y: 4 } },

dashboard.panel('Spotcheck Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
{ gridPos: { h: 4, w: 3, x: 9, y: 4 } },

// grid row 3
dashboard.panel('Metric Test Expected') +
dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
{ gridPos: { h: 4, w: 3, x: 0, y: 8 } },

dashboard.panel('Metric Test Actual') +
dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
{ gridPos: { h: 4, w: 3, x: 3, y: 8 } },

dashboard.panel('Websocket Missing') +
dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
{ gridPos: { h: 4, w: 3, x: 6, y: 8 } },

dashboard.panel('Websocket Missing %') +
dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
{ gridPos: { h: 4, w: 3, x: 9, y: 8 } },
// end of grid

dashboard.panel('Log Write to read Latency Percentiles') +
dashboard.queryPanel([
'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
], ['p95', 'p50']) +
{ gridPos: { h: 6, w: 12, x: 12, y: 0 } },

Expand All @@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet';
).addTargets(
[
grafana.prometheus.target(
'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
legendFormat='{{le}}',
format='heatmap',
),
Expand All @@ -125,24 +125,24 @@ local grafana = import 'grafonnet/grafana.libsonnet';

dashboard.panel('Spot Check Query') +
dashboard.queryPanel([
'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
], ['p99', 'p95']) +
{ gridPos: { h: 6, w: 12, x: 0, y: 14 } },

dashboard.panel('Metric Test Query') +
dashboard.queryPanel([
'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
], ['p99', 'p95'],) +
{ gridPos: { h: 6, w: 12, x: 12, y: 14 } },

dashboard.panel('Spot Check Missing %') +
dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
{ gridPos: { h: 6, w: 12, x: 0, y: 20 } },

g.panel('Missing logs') +
g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') +
g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') +
{ gridPos: { h: 6, w: 12, x: 12, y: 20 } },

],
Expand Down
6 changes: 3 additions & 3 deletions production/loki-mixin/dashboards/loki-logs.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ local template = import 'grafonnet/template.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,

} + lokiLogs +
$.dashboard('Loki / Logs', uid='logs')
Expand All @@ -61,8 +60,9 @@ local template = import 'grafonnet/template.libsonnet';
p {
targets: [
e {
expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr
else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''),
expr: if dashboards['loki-logs.json'].showMultiCluster
then std.strReplace(super.expr, 'cluster="$cluster"', $._config.per_cluster_label + '="$cluster"')
else std.strReplace(super.expr, 'cluster="$cluster", ', ''),
}
for e in p.targets
],
Expand Down
26 changes: 20 additions & 6 deletions production/loki-mixin/dashboards/loki-operational.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
showAnnotations:: true,
showLinks:: true,
showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,

hiddenRows:: [
'Cassandra',
Expand Down Expand Up @@ -62,7 +61,22 @@ local utils = import 'mixin-utils/utils.libsonnet';

local replaceClusterMatchers(expr) =
if dashboards['loki-operational.json'].showMultiCluster
then expr
// Replace the recording rules cluster label with the per-cluster label
then std.strReplace(
// Replace the cluster label for equality matchers with the per-cluster label
std.strReplace(
// Replace the cluster label for regex matchers with the per-cluster label
std.strReplace(
expr,
'cluster=~"$cluster"',
$._config.per_cluster_label + '=~"$cluster"'
),
'cluster="$cluster"',
$._config.per_cluster_label + '="$cluster"'
),
'cluster_',
$._config.per_cluster_label + '_'
)
Comment on lines +64 to +79
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noting that having to do something like this is yet another reason for us to try to get rid of these giant copy/pasted JSON dashboards at some point.

else
std.strReplace(
std.strReplace(
Expand Down Expand Up @@ -143,7 +157,7 @@ local utils = import 'mixin-utils/utils.libsonnet';


local replaceAllMatchers(expr) =
replaceMatchers(replaceClusterMatchers(expr)),
replaceMatchers(expr),

local selectDatasource(ds) =
if ds == null || ds == '' then ds
Expand Down Expand Up @@ -179,7 +193,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(p, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in p.targets
] else [],
Expand All @@ -188,7 +202,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(sp, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in sp.targets
] else [],
Expand All @@ -197,7 +211,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
datasource: selectDatasource(super.datasource),
targets: if std.objectHas(ssp, 'targets') then [
e {
expr: removeInternalComponents(p.title, e.expr),
expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
}
for e in ssp.targets
] else [],
Expand Down
3 changes: 1 addition & 2 deletions production/loki-mixin/dashboards/loki-reads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, and in the other files.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above: I did not see it being used.

clusterMatchers::
if cfg.showMultiCluster then
[utils.selector.re(cfg.clusterLabel, '$cluster')]
[utils.selector.re($._config.per_cluster_label, '$cluster')]
else
[],

Expand Down
3 changes: 1 addition & 2 deletions production/loki-mixin/dashboards/loki-writes.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
local cfg = self,

showMultiCluster:: true,
clusterLabel:: $._config.per_cluster_label,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we removing this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed it because it is unused.

clusterMatchers::
if cfg.showMultiCluster then
[utils.selector.re(cfg.clusterLabel, '$cluster')]
[utils.selector.re($._config.per_cluster_label, '$cluster')]
else
[],

Expand Down
Loading
Loading