diff --git a/changelog/fragments/1763382303-prometheus-otel-self-monitoring.yaml b/changelog/fragments/1763382303-prometheus-otel-self-monitoring.yaml new file mode 100644 index 00000000000..46dd652f3fa --- /dev/null +++ b/changelog/fragments/1763382303-prometheus-otel-self-monitoring.yaml @@ -0,0 +1,32 @@ +# Kind can be one of: +# - breaking-change: a change to previously-documented behavior +# - deprecation: functionality that is being removed in a later release +# - bug-fix: fixes a problem in a previous version +# - enhancement: extends functionality but does not break or fix existing behavior +# - feature: new functionality +# - known-issue: problems that we are aware of in a given version +# - security: impacts on the security of a product or a user’s deployment. +# - upgrade: important information for someone upgrading from a prior version +# - other: does not fit into any of the other categories +kind: bug-fix + +# Change summary; an 80ish characters long description of the change. +summary: Ensure the monitoring input for the Otel collector can only run inside the collector. + +# Long description; in case the summary is not enough to describe the change +# this field accommodates a description without length limits. +# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment. +#description: + +# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc. +component: elastic-agent + +# PR URL; optional; the PR number that added the changeset. +# If not present, it is automatically filled by the tooling finding the PR where this changelog fragment has been added. +# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number. +# Please provide it if you are adding a fragment for a different PR. 
+#pr: https://github.com/owner/repo/1234 + +# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of). +# If not present is automatically filled by the tooling with the issue linked to the PR number. +#issue: https://github.com/owner/repo/1234 diff --git a/internal/pkg/agent/application/coordinator/coordinator.go b/internal/pkg/agent/application/coordinator/coordinator.go index a0067d3190b..945163b1830 100644 --- a/internal/pkg/agent/application/coordinator/coordinator.go +++ b/internal/pkg/agent/application/coordinator/coordinator.go @@ -1817,6 +1817,12 @@ func (c *Coordinator) splitModelBetweenManagers(model *component.Model) (runtime case component.OtelRuntimeManager: otelComponents = append(otelComponents, comp) case component.ProcessRuntimeManager: + // Hack to fix https://github.com/elastic/elastic-agent/issues/11169 + // TODO: Remove this after https://github.com/elastic/elastic-agent/issues/10220 is resolved + if comp.ID == "prometheus/metrics-monitoring" { + c.logger.Warnf("The Otel prometheus metrics monitoring input can't run in a beats process, skipping") + continue + } runtimeComponents = append(runtimeComponents, comp) default: // this should be impossible if we parse the configuration correctly diff --git a/testing/integration/ess/beat_receivers_test.go b/testing/integration/ess/beat_receivers_test.go index 6e7e8fb8a6a..936f2780846 100644 --- a/testing/integration/ess/beat_receivers_test.go +++ b/testing/integration/ess/beat_receivers_test.go @@ -817,6 +817,12 @@ agent.monitoring.enabled: false } } +// Log lines TestBeatsReceiverProcessRuntimeFallback checks for +const ( + otelRuntimeUnsupportedLogLineStart = "otel runtime is not supported" + prometheusInputSkippedLogLine = "The Otel prometheus metrics monitoring input can't run in a beats process, skipping" +) + // TestBeatsReceiverProcessRuntimeFallback verifies that we fall back to the process runtime if the otel runtime // does not support the requested 
configuration. func TestBeatsReceiverProcessRuntimeFallback(t *testing.T) { @@ -848,7 +854,6 @@ outputs: hosts: [http://localhost:9200] api_key: placeholder indices: [] # not supported by the elasticsearch exporter -agent.monitoring.enabled: false ` // this is the context for the whole test, with a global timeout defined @@ -872,13 +877,14 @@ agent.monitoring.enabled: false status, statusErr := fixture.ExecStatus(ctx) assert.NoError(collect, statusErr) // we should be running beats processes even though the otel runtime was requested - assertBeatsHealthy(collect, &status, component.ProcessRuntimeManager, 1) + assertBeatsHealthy(collect, &status, component.ProcessRuntimeManager, 4) }, 1*time.Minute, 1*time.Second) logsBytes, err := fixture.Exec(ctx, []string{"logs", "-n", "1000", "--exclude-events"}) require.NoError(t, err) // verify we've logged a warning about using the process runtime - var unsupportedLogRecord map[string]any + var unsupportedLogRecords []map[string]any + var prometheusUnsupportedLogRecord map[string]any for _, line := range strings.Split(string(logsBytes), "\n") { line = strings.TrimSpace(line) if line == "" { @@ -889,9 +895,13 @@ agent.monitoring.enabled: false continue } - if message, ok := logRecord["message"].(string); ok && strings.HasPrefix(message, "otel runtime is not supported") { - unsupportedLogRecord = logRecord - break + if message, ok := logRecord["message"].(string); ok { + if strings.HasPrefix(message, otelRuntimeUnsupportedLogLineStart) { + unsupportedLogRecords = append(unsupportedLogRecords, logRecord) + } + if strings.HasPrefix(message, prometheusInputSkippedLogLine) { + prometheusUnsupportedLogRecord = logRecord + } } } @@ -902,11 +912,8 @@ agent.monitoring.enabled: false } }) - require.NotNil(t, unsupportedLogRecord, "unsupported log message should be present") - message, ok := unsupportedLogRecord["message"].(string) - require.True(t, ok, "log message field should be a string") - expectedMessage := "otel runtime is not 
supported for component system/metrics-default, switching to process runtime, reason: unsupported configuration for system/metrics-default: error translating config for output: default, unit: system/metrics-default, error: indices is currently not supported: unsupported operation" - assert.Equal(t, expectedMessage, message) + assert.Len(t, unsupportedLogRecords, 5, "one log line for each component we try to run") + assert.NotEmpty(t, prometheusUnsupportedLogRecord, "should get a log line about Otel prometheus metrics input being skipped") } // TestComponentWorkDir verifies that the component working directory is not deleted when moving the component from