Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v1.15] metrics: prepare to increase cardinality of BPF metrics key #31558

Merged
merged 1 commit into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions bugtool/cmd/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ func copyCiliumInfoCommands(cmdDir string, k8sPods []string) []string {
ciliumCommands := []string{
fmt.Sprintf("cilium-dbg debuginfo --output=markdown,json -f --output-directory=%s", cmdDir),
"cilium-dbg metrics list",
"cilium-dbg bpf metrics list",
"cilium-dbg fqdn cache list",
"cilium-dbg config -a",
"cilium-dbg encrypt status",
Expand Down
70 changes: 41 additions & 29 deletions cilium-dbg/cmd/bpf_metrics_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,25 +26,21 @@ const (
)

type metricsRow struct {
reasonCode int
reasonCode uint8
reasonDesc string
direction string
packets int
bytes int
}

type jsonMetricValues struct {
Packets uint64 `json:"packets"`
Bytes uint64 `json:"bytes"`
packets uint64
bytes uint64
}

type jsonMetric struct {
Reason uint64 `json:"reason"`
Description string `json:"description"`
Values map[string]jsonMetricValues `json:"values"`
Reason string `json:"reason"`
Direction string `json:"direction"`
Packets uint64 `json:"packets"`
Bytes uint64 `json:"bytes"`
}

type jsonMetrics []jsonMetric
type jsonMetrics []*jsonMetric

var bpfMetricsListCmd = &cobra.Command{
Use: "list",
Expand Down Expand Up @@ -81,31 +77,41 @@ func listJSONMetrics(bpfMetricsList []*metricsRow) {
return
}

metricsByReason := map[int]jsonMetric{}
// All keys in the metrics map that have these fields in common will have
// their byte and packet counters summed and presented as a single metric.
// This is to allow newer Cilium versions to make use of the reserved bits
// in the metricsmap key without breaking older versions of the agent. From
// the old agent's perspective, keys that differ only in those reserved bits
// would otherwise appear as duplicate metrics.
type key struct {
reason string
direction string
}

metrics := make(map[key]*jsonMetric)

for _, row := range bpfMetricsList {
if _, ok := metricsByReason[row.reasonCode]; !ok {
metricsByReason[row.reasonCode] = jsonMetric{
Reason: uint64(row.reasonCode),
Description: monitorAPI.DropReason(uint8(row.reasonCode)),
Values: map[string]jsonMetricValues{},
}
k := key{
reason: monitorAPI.DropReason(row.reasonCode),
direction: strings.ToLower(row.direction),
}

direction := strings.ToLower(row.direction)

metricsByReason[row.reasonCode].Values[direction] = jsonMetricValues{
Packets: uint64(row.packets),
Bytes: uint64(row.bytes),
if _, ok := metrics[k]; !ok {
metrics[k] = &jsonMetric{
Reason: monitorAPI.DropReason(row.reasonCode),
Direction: strings.ToLower(row.direction),
}
}

metrics[k].Packets += row.packets
metrics[k].Bytes += row.bytes
}

metrics := jsonMetrics{}
for _, v := range metricsByReason {
metrics = append(metrics, v)
var out jsonMetrics
for _, v := range metrics {
out = append(out, v)
}

if err := command.PrintOutput(metrics); err != nil {
if err := command.PrintOutput(out); err != nil {
fmt.Fprintf(os.Stderr, "error getting output of map in %s: %s\n", command.OutputOptionString(), err)
os.Exit(1)
}
Expand Down Expand Up @@ -147,7 +153,13 @@ func listHumanReadableMetrics(bpfMetricsList []*metricsRow) {
}

func extractRow(key *metricsmap.Key, values *metricsmap.Values) *metricsRow {
return &metricsRow{int(key.Reason), key.DropForwardReason(), key.Direction(), int(values.Count()), int(values.Bytes())}
return &metricsRow{
key.Reason,
key.DropForwardReason(),
key.Direction(),
values.Count(),
values.Bytes(),
}
}

func init() {
Expand Down
66 changes: 33 additions & 33 deletions cilium-dbg/cmd/bpf_metrics_list_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,42 +36,46 @@ func (s *BPFMetricsMapSuite) TestDumpMetrics(c *C) {
Key: metricsmap.Key{Reason: 132, Dir: 2},
Values: metricsmap.Values{{Count: 300, Bytes: 3000}},
},
// 'Duplicate' metric that had reserved bits of its key utilized in a
// newer version of Cilium. This should result in counters being summed
// with other keys with matching known fields. For example, Cilium 1.16
// adds line and file info to each metric, which older versions will
// ignore. In this case, all keys with the same reason and direction
// should be summed and presented as a single metric.
{
Key: metricsmap.Key{Reason: 132, Dir: 2},
Values: metricsmap.Values{{Count: 1, Bytes: 1}},
},
},
),
}

desc := func(x int) string {
reason := func(x int) string {
return monitorAPI.DropReason(uint8(x))
}

dir := func(d int) string {
return strings.ToLower(metricsmap.MetricDirection(uint8(d)))
}

jsonEncodedMetricsMap := jsonMetrics{
jsonMetric{
Reason: 0,
Description: desc(0),
Values: map[string]jsonMetricValues{
dir(1): {
Packets: 100,
Bytes: 1000,
},
dir(2): {
Packets: 200,
Bytes: 2000,
},
},
want := jsonMetrics{
{
Reason: reason(0),
Direction: dir(1),
Packets: 100,
Bytes: 1000,
},
jsonMetric{
Reason: 132,
Description: desc(132),
Values: map[string]jsonMetricValues{
dir(2): {
Packets: 300,
Bytes: 3000,
},
},
{
Reason: reason(0),
Direction: dir(2),
Packets: 200,
Bytes: 2000,
},
{
Reason: reason(132),
Direction: dir(2),
Packets: 301,
Bytes: 3001,
},
}

Expand All @@ -81,17 +85,13 @@ func (s *BPFMetricsMapSuite) TestDumpMetrics(c *C) {
}
}, c)

var jsonEncodedMetricsMapDump jsonMetrics
err := json.Unmarshal([]byte(rawDump), &jsonEncodedMetricsMapDump)
var got jsonMetrics
err := json.Unmarshal([]byte(rawDump), &got)
c.Assert(err, IsNil, Commentf("invalid JSON output: '%s', '%s'", err, rawDump))

sort.Slice(jsonEncodedMetricsMap, func(i, j int) bool {
return jsonEncodedMetricsMap[i].Reason <= jsonEncodedMetricsMap[j].Reason
})

sort.Slice(jsonEncodedMetricsMapDump, func(i, j int) bool {
return jsonEncodedMetricsMapDump[i].Reason <= jsonEncodedMetricsMapDump[j].Reason
sort.Slice(got, func(i, j int) bool {
return got[i].Packets <= got[j].Packets
})

c.Assert(jsonEncodedMetricsMap, checker.DeepEquals, jsonEncodedMetricsMapDump)
c.Assert(want, checker.DeepEquals, got)
}
78 changes: 50 additions & 28 deletions pkg/maps/metricsmap/metricsmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,24 +158,10 @@ type metricsmapCollector struct {
droppedByteDesc *prometheus.Desc
forwardCountDesc *prometheus.Desc
forwardByteDesc *prometheus.Desc

// eBPF code seems to expose multiple reasons for forwarded metrics
// as opposed to what is stated in bpf/lib/metrics.h comments.
// IterateWithCallback iterates through BPF map for each reason and direction.
// Since we do not have "reason" label on forwarded metrics, we would end up collecting
// same forwarded metric multiple times which is not allowed by prometheus client.
// See https://github.com/prometheus/client_golang/issues/242
//
// promMetrics is a generic map used to sum all values by desired set of labels
// for both forwarded and dropped metrics
forwardedMetricsMap promMetrics[forwardLabels]
droppedMetricsMap promMetrics[dropLabels]
}

func newMetricsMapCollector() prometheus.Collector {
return &metricsmapCollector{
droppedMetricsMap: make(map[dropLabels]metricValues),
forwardedMetricsMap: make(map[forwardLabels]metricValues),
droppedByteDesc: prometheus.NewDesc(
prometheus.BuildFQName(metrics.Namespace, "", "drop_bytes_total"),
"Total dropped bytes, tagged by drop reason and ingress/egress direction",
Expand Down Expand Up @@ -215,14 +201,24 @@ type metricValues struct {

type labels comparable

type promMetrics[k labels] map[k]metricValues

func (p promMetrics[k]) upsert(labels k, values *Values) {
// promMetrics is used to sum values by a desired set of labels for both
// forwarded and dropped metrics.
type promMetrics[k labels] map[k]*metricValues

// sum accumulates a value for the given label set k and stores it in p. Can be
// called multiple times with the same label set.
//
// values is a row from the metrics map, a per-cpu data structure. All entries
// in the row are summed, and the result is added to any preexisting values
// belonging to the label set.
func (p promMetrics[k]) sum(labels k, values *Values) {
if v, ok := p[labels]; ok {
v.bytes = float64(values.Bytes())
v.count = float64(values.Count())
v.bytes += float64(values.Bytes())
v.count += float64(values.Count())
return
}
p[labels] = metricValues{

p[labels] = &metricValues{
bytes: float64(values.Bytes()),
count: float64(values.Count()),
}
Expand All @@ -232,32 +228,58 @@ func (mc *metricsmapCollector) Collect(ch chan<- prometheus.Metric) {
mc.mutex.Lock()
defer mc.mutex.Unlock()

// The datapath knows many reasons for forwarding or dropping a packet. All
// packet metrics carry a direction label, and forwarded packets can carry
// either the 'success' or 'interface' forward reason depending on where it
// came in.
//
// Drop metrics carry direction and one of many possible drop reasons.
//
// Since Cilium 1.16, the underlying metrics map contains line/file
// information for all metrics to enable troubleshooting. We don't expose
// these as labels through the /metrics endpoint to keep cardinality low and
// to avoid breaking user queries and recording rules. `cilium-dbg bpf metrics
// list` always shows all properties and is included in sysdumps.
//
// The code below first generates a label set, typically a subset of the
// members of the metrics key, and sums up all byte/packet counters matching
// the label set. This accounts for future versions of Cilium adding new
// fields to the key. Without the summing logic in place, downgrading the
// agent would cause surprising behaviour: from the perspective of the
// downgraded agent, keys that differ only in the new fields would appear as
// multiple identical metrics with different values. The Prometheus library
// rejects metrics with duplicate label sets.

drop := make(promMetrics[dropLabels])
fwd := make(promMetrics[forwardLabels])

err := Metrics.IterateWithCallback(func(key *Key, values *Values) {
if key.IsDrop() {
labelSet := dropLabels{
direction: key.Direction(),
reason: key.DropForwardReason(),
}
mc.droppedMetricsMap.upsert(labelSet, values)
} else {
labelSet := forwardLabels{
direction: key.Direction(),
}
mc.forwardedMetricsMap.upsert(labelSet, values)
drop.sum(labelSet, values)

return
}

labelSet := forwardLabels{
direction: key.Direction(),
}
fwd.sum(labelSet, values)
})
if err != nil {
log.WithError(err).Warn("Failed to read metrics from BPF map")
// Do not update partial metrics
return
}

for labels, value := range mc.forwardedMetricsMap {
for labels, value := range fwd {
mc.updateCounterMetric(mc.forwardCountDesc, ch, value.count, labels.direction)
mc.updateCounterMetric(mc.forwardByteDesc, ch, value.bytes, labels.direction)
}

for labels, value := range mc.droppedMetricsMap {
for labels, value := range drop {
mc.updateCounterMetric(mc.droppedCountDesc, ch, value.count, labels.reason, labels.direction)
mc.updateCounterMetric(mc.droppedByteDesc, ch, value.bytes, labels.reason, labels.direction)
}
Expand Down
2 changes: 1 addition & 1 deletion test/helpers/wrappers.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ func OpenSSLShowCerts(host string, port uint16, serverName string) string {
// GetBPFPacketsCount returns the number of packets for a given drop reason and
// direction by parsing BPF metrics.
func GetBPFPacketsCount(kubectl *Kubectl, pod, reason, direction string) (int, error) {
cmd := fmt.Sprintf("cilium-dbg bpf metrics list -o json | jq '.[] | select(.description == \"%s\").values.%s.packets'", reason, direction)
cmd := fmt.Sprintf("cilium-dbg bpf metrics list -o json | jq '[.[] | select(.reason == \"%s\") | select(.direction == \"%s\").packets] | add'", reason, direction)

res := kubectl.CiliumExecMustSucceed(context.TODO(), pod, cmd)

Expand Down