Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1.20] Add container out of memory metrics #5706

Merged
merged 1 commit into from
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 12 additions & 0 deletions internal/oci/runtime_oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cri-o/cri-o/internal/log"
"github.com/cri-o/cri-o/pkg/config"
types "github.com/cri-o/cri-o/server/cri/types"
"github.com/cri-o/cri-o/server/metrics"
"github.com/cri-o/cri-o/utils"
"github.com/cri-o/cri-o/utils/cmdrunner"
"github.com/fsnotify/fsnotify"
Expand Down Expand Up @@ -921,6 +922,17 @@ func (r *runtimeOCI) UpdateContainerStatus(c *Container) error {
oomFilePath := filepath.Join(c.bundlePath, "oom")
if _, err = os.Stat(oomFilePath); err == nil {
c.state.OOMKilled = true

// Collect total metric
metrics.CRIOContainersOOMTotal.Inc()

// Collect metric by container name
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
if err != nil {
log.Warnf(context.Background(), "Unable to write OOM metric by container: %v", err)
} else {
counter.Inc()
}
}

return nil
Expand Down
16 changes: 14 additions & 2 deletions internal/oci/runtime_vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/containerd/ttrpc"
"github.com/containers/libpod/v2/pkg/cgroups"
types "github.com/cri-o/cri-o/server/cri/types"
"github.com/cri-o/cri-o/server/metrics"
"github.com/cri-o/cri-o/utils"
"github.com/cri-o/cri-o/utils/errdefs"
"github.com/cri-o/cri-o/utils/fifo"
Expand Down Expand Up @@ -663,6 +664,17 @@ func (r *runtimeVM) updateContainerStatus(c *Container) error {
oomFilePath := filepath.Join(c.bundlePath, "oom")
if _, err = os.Stat(oomFilePath); err == nil {
c.state.OOMKilled = true

// Collect total metric
metrics.CRIOContainersOOMTotal.Inc()

// Collect metric by container name
counter, err := metrics.CRIOContainersOOM.GetMetricWithLabelValues(c.Name())
if err != nil {
logrus.Warnf("Unable to write OOM metric by container: %v", err)
} else {
counter.Inc()
}
}
}
return nil
Expand Down Expand Up @@ -728,12 +740,12 @@ func (r *runtimeVM) ContainerStats(c *Container, _ string) (*ContainerStats, err
return nil, errors.Wrap(err, "failed to extract container metrics")
}

metrics, ok := stats.(*cgroups.Metrics)
m, ok := stats.(*cgroups.Metrics)
if !ok {
return nil, errors.Errorf("Unknown stats type %T", stats)
}

return metricsToCtrStats(c, metrics), nil
return metricsToCtrStats(c, m), nil
}

// SignalContainer sends a signal to a container process.
Expand Down
27 changes: 27 additions & 0 deletions server/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ const (
// CRIOImageLayerReuseKey is the key for the CRI-O image layer reuse metrics.
CRIOImageLayerReuseKey = "crio_image_layer_reuse"

// CRIOContainersOOMTotalKey is the key for the total CRI-O container out of memory metrics.
CRIOContainersOOMTotalKey = "crio_containers_oom_total"

// CRIOContainersOOMKey is the key for the CRI-O container out of memory metrics per container name.
CRIOContainersOOMKey = "crio_containers_oom"

subsystem = "container_runtime"
)

Expand Down Expand Up @@ -145,6 +151,25 @@ var (
},
[]string{"name"},
)

// CRIOContainersOOMTotal collects the total number of out of memory (OOM) kills across all containers and sandboxes.
CRIOContainersOOMTotal = prometheus.NewCounter(
prometheus.CounterOpts{
Subsystem: subsystem,
Name: CRIOContainersOOMTotalKey,
Help: "Amount of containers killed because they ran out of memory (OOM)",
},
)

// CRIOContainersOOM collects container out of memory (oom) metrics per container and sandbox name.
CRIOContainersOOM = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: subsystem,
Name: CRIOContainersOOMKey,
Help: "Amount of containers killed because they ran out of memory (OOM) by their name",
},
[]string{"name"},
)
)

var registerMetrics sync.Once
Expand All @@ -162,6 +187,8 @@ func Register() {
prometheus.MustRegister(CRIOImagePullsFailures)
prometheus.MustRegister(CRIOImagePullsSuccesses)
prometheus.MustRegister(CRIOImageLayerReuse)
prometheus.MustRegister(CRIOContainersOOMTotal)
prometheus.MustRegister(CRIOContainersOOM)
})
}

Expand Down
32 changes: 0 additions & 32 deletions test/ctr.bats
Original file line number Diff line number Diff line change
Expand Up @@ -857,38 +857,6 @@ function wait_until_exit() {
! crictl create "$pod_id" "$newconfig" "$TESTDATA"/sandbox_config.json
}

@test "ctr expose metrics with default port" {
# start crio with default port 9090
port="9090"
CONTAINER_ENABLE_METRICS=true start_crio
if ! port_listens "$port"; then
skip "Metrics port $port not listening"
fi

pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json)
crictl start "$ctr_id"

# get metrics
curl http://localhost:$port/metrics -k
}

@test "ctr expose metrics with custom port" {
# start crio with custom port
port="4321"
CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$port start_crio
if ! port_listens "$port"; then
skip "Metrics port $port not listening"
fi

pod_id=$(crictl runp "$TESTDATA"/sandbox_config.json)
ctr_id=$(crictl create "$pod_id" "$TESTDATA"/container_redis.json "$TESTDATA"/sandbox_config.json)
crictl start "$ctr_id"

# get metrics
curl http://localhost:$port/metrics -k
}

@test "privileged ctr -- check for rw mounts" {
# Can't run privileged container in userns
if test -n "$CONTAINER_UID_MAPPINGS"; then
Expand Down
5 changes: 5 additions & 0 deletions test/helpers.bash
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,11 @@ function check_journald() {
journalctl --version
}

# Print a random TCP port that is currently unused on this host.
#
# The port is obtained by binding a socket to port 0, printing the port number
# that was assigned, and closing the socket again. python3 is preferred and the
# unversioned `python` is used as a fallback, because many distributions no
# longer ship a plain `python` binary.
#
# Outputs: the port number on stdout.
# Returns: non-zero (with a message on stderr) if no Python interpreter is
#          available or the interpreter fails.
#
# Note: the socket is closed before this function returns, so another process
# could claim the port before the caller binds to it.
function free_port() {
    local py
    py=$(command -v python3 || command -v python) || {
        echo "free_port: neither python3 nor python found in PATH" >&2
        return 1
    }
    "$py" -c 'import socket; s=socket.socket(); s.bind(("", 0)); print(s.getsockname()[1]); s.close()'
}

# Check whether a port is listening
function port_listens() {
netstat -ln46 | grep -q ":$1\b"
Expand Down
64 changes: 64 additions & 0 deletions test/metrics.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bats
# vim: set syntax=sh:

load helpers

# Bats hook executed before each test in this file.
# Delegates to setup_test (not defined here; presumably provided by the
# `helpers` file loaded above).
function setup() {
setup_test
}

# Bats hook executed after each test in this file.
# Delegates to cleanup_test (not defined here; presumably provided by the
# `helpers` file loaded above).
function teardown() {
cleanup_test
}

@test "metrics with default port" {
    # No CONTAINER_METRICS_PORT is passed, so CRI-O is expected on its default port 9090.
    PORT=9090
    CONTAINER_ENABLE_METRICS=true start_crio

    # Skip rather than fail when nothing is listening on the default port.
    port_listens "$PORT" || skip "Metrics port $PORT not listening"

    # The endpoint must answer successfully; -f makes curl fail on HTTP errors.
    curl -sf "http://localhost:$PORT/metrics"
}

@test "metrics with random port" {
    # Serve metrics on a randomly chosen free port instead of the default one.
    PORT=$(free_port)
    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT="$PORT" start_crio

    # Run a container so that CRI-O has performed some operations.
    crictl run "$TESTDATA/container_redis.json" "$TESTDATA/sandbox_config.json"

    # The scraped output must contain at least one crio_operations metric line.
    curl -sf "http://localhost:$PORT/metrics" | grep crio_operations
}

@test "metrics container oom" {
    PORT=$(free_port)
    CONTAINER_ENABLE_METRICS=true CONTAINER_METRICS_PORT=$PORT start_crio

    # Derive a container config that runs the /oom command from the
    # quay.io/crio/oom image under a 24 MiB (25165824 bytes) memory limit.
    jq '.image.image = "quay.io/crio/oom"
| .linux.resources.memory_limit_in_bytes = 25165824
| .command = ["/oom"]' \
        "$TESTDATA/container_config.json" > "$TESTDIR/config.json"
    CTR_ID=$(crictl run "$TESTDIR/config.json" "$TESTDATA/sandbox_config.json")

    # Poll the container status up to 101 times, 10 seconds apart, and stop
    # as soon as it reports the OOM kill.
    for _ in {1..101}; do
        OUTPUT=$(crictl inspect --output yaml "$CTR_ID")
        if [[ "$OUTPUT" == *"OOMKilled"* ]]; then
            break
        fi
        sleep 10
    done
    # Fail the test if the container never reported an OOM kill.
    [[ "$OUTPUT" == *"OOMKilled"* ]]

    # The total counter must have been incremented exactly once.
    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep '^container_runtime_crio_containers_oom_total')
    [[ "$METRIC" == 'container_runtime_crio_containers_oom_total 1' ]]

    # The per-name counter must carry the container name as its label.
    METRIC=$(curl -sf "http://localhost:$PORT/metrics" | grep 'crio_containers_oom{')
    [[ "$METRIC" == 'container_runtime_crio_containers_oom{name="k8s_container1_podsandbox1_redhat.test.crio_redhat-test-crio_1"} 1' ]]
}
26 changes: 14 additions & 12 deletions tutorials/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,20 @@ endpoint manually via [curl][1].

Besides the [default Go-based metrics][2], CRI-O provides the following additional metrics:

| Metric Key | Possible Labels | Type | Purpose |
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | ------------------------------------------------------------------------ |
| `crio_operations` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operations by operation type. |
| `crio_operations_latency_microseconds_total` | every CRI-O RPC\*,<br><br>`network_setup_pod` (CNI pod network setup time),<br><br>`network_setup_overall` (Overall network setup time) | Summary | Latency in microseconds of CRI-O operations. Split-up by operation type. |
| `crio_operations_latency_microseconds` | every CRI-O RPC\* | Gauge | Latency in microseconds of individual CRI calls for CRI-O operations. Broken down by operation type. |
| `crio_operations_errors` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operation errors by operation type. |
| `crio_image_pulls_by_digest` | `name`, `digest`, `mediatype`, `size` | Counter | Bytes transferred by CRI-O image pulls by digest. |
| `crio_image_pulls_by_name` | `name`, `size` | Counter | Bytes transferred by CRI-O image pulls by name. |
| `crio_image_pulls_by_name_skipped` | `name` | Counter | Bytes skipped by CRI-O image pulls by name. |
| `crio_image_pulls_successes` | `name` | Counter | Successful image pulls by image name |
| `crio_image_pulls_failures` | `name`, `error` | Counter | Failed image pulls by image name and their error category. |
| `crio_image_layer_reuse` | `name` | Counter | Reused (not pulled) local image layer count by name. |
| Metric Key | Possible Labels | Type | Purpose |
| -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | ---------------------------------------------------------------------------------------------------- |
| `crio_operations` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operations by operation type. |
| `crio_operations_latency_microseconds_total` | every CRI-O RPC\*,<br><br>`network_setup_pod` (CNI pod network setup time),<br><br>`network_setup_overall` (Overall network setup time) | Summary | Latency in microseconds of CRI-O operations. Split-up by operation type. |
| `crio_operations_latency_microseconds` | every CRI-O RPC\* | Gauge | Latency in microseconds of individual CRI calls for CRI-O operations. Broken down by operation type. |
| `crio_operations_errors` | every CRI-O RPC\* | Counter | Cumulative number of CRI-O operation errors by operation type. |
| `crio_image_pulls_by_digest` | `name`, `digest`, `mediatype`, `size` | Counter | Bytes transferred by CRI-O image pulls by digest. |
| `crio_image_pulls_by_name` | `name`, `size` | Counter | Bytes transferred by CRI-O image pulls by name. |
| `crio_image_pulls_by_name_skipped` | `name` | Counter | Bytes skipped by CRI-O image pulls by name. |
| `crio_image_pulls_successes` | `name` | Counter | Successful image pulls by image name |
| `crio_image_pulls_failures` | `name`, `error` | Counter | Failed image pulls by image name and their error category. |
| `crio_image_layer_reuse` | `name` | Counter | Reused (not pulled) local image layer count by name. |
| `crio_containers_oom_total` | | Counter | Total number of containers killed because they ran out of memory (OOM) |
| `crio_containers_oom` | `name` | Counter | Containers killed because they ran out of memory (OOM) by their name |

- Available CRI-O RPCs from the [gRPC API][3]: `Attach`, `ContainerStats`, `ContainerStatus`,
`CreateContainer`, `Exec`, `ExecSync`, `ImageFsInfo`, `ImageStatus`,
Expand Down