Skip to content

Commit

Permalink
Add container out of memory metrics
Browse files Browse the repository at this point in the history
This patch adds two new metrics,
`container_runtime_crio_containers_oom_total` and
`container_runtime_crio_containers_oom`, which collect out of memory
(OOM) events globally and per container name. This also includes
sandboxes, since we reuse the CRI-O internal name for every container.

Signed-off-by: Sascha Grunert <sgrunert@redhat.com>
  • Loading branch information
saschagrunert authored and haircommander committed Mar 4, 2022
1 parent 33afe03 commit 17b8f71
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 46 deletions.
12 changes: 12 additions & 0 deletions internal/oci/runtime_oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cri-o/cri-o/internal/log"
"github.com/cri-o/cri-o/pkg/config"
types "github.com/cri-o/cri-o/server/cri/types"
"github.com/cri-o/cri-o/server/metrics"
"github.com/cri-o/cri-o/utils"
"github.com/cri-o/cri-o/utils/cmdrunner"
"github.com/fsnotify/fsnotify"
Expand Down Expand Up @@ -891,6 +892,17 @@ func (r *runtimeOCI) UpdateContainerStatus(c *Container) error {
oomFilePath := filepath.Join(c.bundlePath, "oom")
if _, err = os.Stat(oomFilePath); err == nil {
c.state.OOMKilled = true

// Collect total metric
metrics.CRIOContainersOOMTotal.Inc()

// Collect metric by container name
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
if err != nil {
log.Warnf(ctx, "Unable to write OOM metric by container: %v", err)
} else {
counter.Inc()
}
}

return nil
Expand Down
16 changes: 14 additions & 2 deletions internal/oci/runtime_vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/containerd/ttrpc"
"github.com/containers/libpod/v2/pkg/cgroups"
types "github.com/cri-o/cri-o/server/cri/types"
"github.com/cri-o/cri-o/server/metrics"
"github.com/cri-o/cri-o/utils"
"github.com/cri-o/cri-o/utils/errdefs"
"github.com/cri-o/cri-o/utils/fifo"
Expand Down Expand Up @@ -662,6 +663,17 @@ func (r *runtimeVM) updateContainerStatus(c *Container) error {
oomFilePath := filepath.Join(c.bundlePath, "oom")
if _, err = os.Stat(oomFilePath); err == nil {
c.state.OOMKilled = true

// Collect total metric
metrics.CRIOContainersOOMTotal.Inc()

// Collect metric by container name
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
if err != nil {
logrus.Warnf("Unable to write OOM metric by container: %v", err)
} else {
counter.Inc()
}
}
}
return nil
Expand Down Expand Up @@ -727,12 +739,12 @@ func (r *runtimeVM) ContainerStats(c *Container, _ string) (*ContainerStats, err
return nil, errors.Wrap(err, "failed to extract container metrics")
}

metrics, ok := stats.(*cgroups.Metrics)
m, ok := stats.(*cgroups.Metrics)
if !ok {
return nil, errors.Errorf("Unknown stats type %T", stats)
}

return metricsToCtrStats(c, metrics), nil
return metricsToCtrStats(c, m), nil
}

// SignalContainer sends a signal to a container process.
Expand Down
27 changes: 27 additions & 0 deletions server/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ const (
// CRIOImageLayerReuseKey is the key for the CRI-O image layer reuse metrics.
CRIOImageLayerReuseKey = "crio_image_layer_reuse"

// CRIOContainersOOMTotalKey is the key for the total CRI-O container out of memory metrics.
CRIOContainersOOMTotalKey = "crio_containers_oom_total"

// CRIOContainersOOMKey is the key for the CRI-O container out of memory metrics per container name.
CRIOContainersOOMKey = "crio_containers_oom"

subsystem = "container_runtime"
)

Expand Down Expand Up @@ -145,6 +151,25 @@ var (
},
[]string{"name"},
)

// CRIOContainersOOMTotal collects container out of memory (oom) metrics for every container and sandboxes.
CRIOContainersOOMTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Subsystem: subsystem,
Name: CRIOContainersOOMTotalKey,
Help: "Amount of containers killed because they ran out of memory (OOM)",
},
)

// CRIOContainersOOM collects container out of memory (oom) metrics per container and sandbox name.
CRIOContainersOOM = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: subsystem,
Name: CRIOContainersOOMKey,
Help: "Amount of containers killed because they ran out of memory (OOM) by their name",
},
[]string{"name"},
)
)

var registerMetrics sync.Once
Expand All @@ -162,6 +187,8 @@ func Register() {
prometheus.MustRegister(CRIOImagePullsFailures)
prometheus.MustRegister(CRIOImagePullsSuccesses)
prometheus.MustRegister(CRIOImageLayerReuse)
prometheus.MustRegister(CRIOContainersOOMTotal)
prometheus.MustRegister(CRIOContainersOOM)
})
}

Expand Down
32 changes: 0 additions & 32 deletions test/ctr.bats
Original file line number Diff line number Diff line change
Expand Up @@ -849,38 +849,6 @@ function wait_until_exit() {
! crictl create "$pod_id" "$newconfig" "$TESTDATA"/sandbox_config.json
}

@test "ctr expose metrics with default port" {
# start crio with default port 9090
port="9090"
CONTAINER_ENABLE_METRICS=true start_crio
if ! port_listens "$port"; then
skip "Metrics port $port not listening"
fi

pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json)
crictl start "$ctr_id"

# get metrics
curl http://localhost:$port/metrics -k
}

@test "ctr expose metrics with custom port" {
# start crio with custom port
port="4321"
CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$port start_crio
if ! port_listens "$port"; then
skip "Metrics port $port not listening"
fi

pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json)
crictl start "$ctr_id"

# get metrics
curl http://localhost:$port/metrics -k
}

@test "privileged ctr -- check for rw mounts" {
# Can't run privileged container in userns
if test -n "$CONTAINER_UID_MAPPINGS"; then
Expand Down
5 changes: 5 additions & 0 deletions test/helpers.bash
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,11 @@ function check_journald() {
journalctl --version
}

# Print a random TCP port that is currently free on this host.
#
# A socket is bound to port 0 so that the kernel picks an unused port; the
# chosen port number is printed to stdout and the socket is closed again.
# `python3` is preferred and `python` is only used as a fallback, because many
# distributions no longer ship an unversioned `python` binary.
#
# Returns the interpreter's exit status, or 1 (with a message on stderr) if no
# Python interpreter is available.
#
# NOTE: the socket is closed before the port is handed to the caller, so
# another process may claim the port before the caller binds to it.
function free_port() {
    local interpreter
    for interpreter in python3 python; do
        if command -v "$interpreter" > /dev/null 2>&1; then
            "$interpreter" -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'
            return
        fi
    done

    echo "free_port: neither python3 nor python found in PATH" >&2
    return 1
}

# Check whether a port is listening
function port_listens() {
netstat -ln46 | grep -q ":$1\b"
Expand Down
64 changes: 64 additions & 0 deletions test/metrics.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bats
# vim: set syntax=sh:

load helpers

# Bats per-test setup hook: prepares the test environment by delegating to
# setup_test (presumably provided by the `helpers` file loaded above).
function setup() {
setup_test
}

# Bats per-test teardown hook: cleans up after the test by delegating to
# cleanup_test (presumably provided by the `helpers` file loaded above).
function teardown() {
cleanup_test
}

@test "metrics with default port" {
    # No CONTAINER_METRICS_PORT override is passed, so the endpoint is
    # expected on the default port 9090.
    PORT=9090
    local metrics_url="http://localhost:$PORT/metrics"

    CONTAINER_ENABLE_METRICS=true start_crio

    # The default port may not be listening on this host (e.g. already taken
    # by another process); there is nothing to verify in that case.
    port_listens "$PORT" || skip "Metrics port $PORT not listening"

    # The test passes when the endpoint answers with a success status.
    curl -sf "$metrics_url"
}

@test "metrics with random port" {
    # Serve the metrics on a randomly chosen free port instead of the default.
    PORT="$(free_port)"
    local metrics_url="http://localhost:$PORT/metrics"
    CONTAINER_METRICS_PORT="$PORT" CONTAINER_ENABLE_METRICS=true start_crio

    # Run a container so that CRI-O has handled some operations.
    local ctr_config="$TESTDATA/container_redis.json"
    local pod_config="$TESTDATA/sandbox_config.json"
    crictl run "$ctr_config" "$pod_config"

    # The operations metric has to show up in the scraped output.
    curl -sf "$metrics_url" | grep crio_operations
}

@test "metrics container oom" {
    PORT="$(free_port)"
    local metrics_url="http://localhost:$PORT/metrics"
    CONTAINER_METRICS_PORT="$PORT" CONTAINER_ENABLE_METRICS=true start_crio

    # Derive a container config that runs the /oom binary from the
    # quay.io/crio/oom image under a memory limit of 25165824 bytes (24 MiB).
    # The filter's continuation lines stay flush left so the jq program text
    # is unchanged.
    jq '.image.image = "quay.io/crio/oom"
| .linux.resources.memory_limit_in_bytes = 25165824
| .command = ["/oom"]' \
        "$TESTDATA/container_config.json" > "$TESTDIR/config.json"
    CTR_ID="$(crictl run "$TESTDIR/config.json" "$TESTDATA/sandbox_config.json")"

    # Poll the container status until it reports OOMKilled: at most 101
    # inspections, with a 10 second pause after each unsuccessful one.
    for _ in {0..100}; do
        OUTPUT="$(crictl inspect --output yaml "$CTR_ID")"
        [[ "$OUTPUT" != *"OOMKilled"* ]] || break
        sleep 10
    done
    [[ "$OUTPUT" == *"OOMKilled"* ]]

    # Exactly one OOM kill must have been counted globally ...
    METRIC="$(curl -sf "$metrics_url" | grep '^container_runtime_crio_containers_oom_total')"
    [[ "$METRIC" == 'container_runtime_crio_containers_oom_total 1' ]]

    # ... and exactly one for the container, labelled with its CRI-O internal name.
    METRIC="$(curl -sf "$metrics_url" | grep 'crio_containers_oom{')"
    [[ "$METRIC" == 'container_runtime_crio_containers_oom{name="k8s_container1_podsandbox1_redhat.test.crio_redhat-test-crio_1"} 1' ]]
}
26 changes: 14 additions & 12 deletions tutorials/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ endpoint manually via [curl][1].

Beside the [default golang based metrics][2], CRI-O provides the following additional metrics:

| Metric Key | Possible Labels | Type | Purpose |
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------ |
| `crio_operations` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operations by operation type. |
| `crio_operations_latency_microseconds_total` | every CRI-O RPC\*,<br><br>`network_setup_pod` (CNI pod network setup time),<br><br>`network_setup_overall` (Overall network setup time) | Summary | Latency in microseconds of CRI-O operations. Split-up by operation type. |
| `crio_operations_latency_microseconds` | every CRI-O RPC\* | Gauge | Latency in microseconds of individual CRI calls for CRI-O operations. Broken down by operation type. |
| `crio_operations_errors` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operation errors by operation type. |
| `crio_image_pulls_by_digest` | `name`, `digest`, `mediatype`, `size` | Counter | Bytes transferred by CRI-O image pulls by digest. |
| `crio_image_pulls_by_name` | `name`, `size` | Counter | Bytes transferred by CRI-O image pulls by name. |
| `crio_image_pulls_by_name_skipped` | `name` | Counter | Bytes skipped by CRI-O image pulls by name. |
| `crio_image_pulls_successes` | `name` | Counter | Successful image pulls by image name |
| `crio_image_pulls_failures` | `name`, `error` | Counter | Failed image pulls by image name and their error category. |
| `crio_image_layer_reuse` | `name` | Counter | Reused (not pulled) local image layer count by name. |
| Metric Key | Possible Labels | Type | Purpose |
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------- |
| `crio_operations` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operations by operation type. |
| `crio_operations_latency_microseconds_total` | every CRI-O RPC\*,<br><br>`network_setup_pod` (CNI pod network setup time),<br><br>`network_setup_overall` (Overall network setup time) | Summary | Latency in microseconds of CRI-O operations. Split-up by operation type. |
| `crio_operations_latency_microseconds` | every CRI-O RPC\* | Gauge | Latency in microseconds of individual CRI calls for CRI-O operations. Broken down by operation type. |
| `crio_operations_errors` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operation errors by operation type. |
| `crio_image_pulls_by_digest` | `name`, `digest`, `mediatype`, `size` | Counter | Bytes transferred by CRI-O image pulls by digest. |
| `crio_image_pulls_by_name` | `name`, `size` | Counter | Bytes transferred by CRI-O image pulls by name. |
| `crio_image_pulls_by_name_skipped` | `name` | Counter | Bytes skipped by CRI-O image pulls by name. |
| `crio_image_pulls_successes` | `name` | Counter | Successful image pulls by image name |
| `crio_image_pulls_failures` | `name`, `error` | Counter | Failed image pulls by image name and their error category. |
| `crio_image_layer_reuse` | `name` | Counter | Reused (not pulled) local image layer count by name. |
| `crio_containers_oom_total` | | Counter | Total number of containers killed because they ran out of memory (OOM) |
| `crio_containers_oom` | `name` | Counter | Containers killed because they ran out of memory (OOM) by their name |

- Available CRI-O RPC's from the [gRPC API][3]: `Attach`, `ContainerStats`, `ContainerStatus`,
`CreateContainer`, `Exec`, `ExecSync`, `ImageFsInfo`, `ImageStatus`,
Expand Down

0 comments on commit 17b8f71

Please sign in to comment.