Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Monitoring] Fix a couple of issues with the cpu usage alert #80737

Merged
merged 4 commits into from Oct 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -18,8 +18,9 @@ import {
import { NodeDetailStatus } from '../node_detail_status';
import { MonitoringTimeseriesContainer } from '../../chart';
import { FormattedMessage } from '@kbn/i18n/react';
import { AlertsCallout } from '../../../alerts/callout';

export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => {
export const AdvancedNode = ({ nodeSummary, metrics, alerts, nodeId, ...props }) => {
const metricsToShow = [
metrics.node_gc,
metrics.node_gc_time,
Expand Down Expand Up @@ -50,9 +51,25 @@ export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => {
</h1>
</EuiScreenReaderOnly>
<EuiPanel>
<NodeDetailStatus stats={nodeSummary} alerts={alerts} />
<NodeDetailStatus
stats={nodeSummary}
alerts={alerts}
alertsStateFilter={(state) =>
state.nodeId === nodeId || state.stackProductUuid === nodeId
}
/>
</EuiPanel>
<EuiSpacer size="m" />
<AlertsCallout
alerts={alerts}
stateFilter={(state) => state.nodeId === nodeId || state.stackProductUuid === nodeId}
nextStepsFilter={(nextStep) => {
if (nextStep.text.includes('Elasticsearch nodes')) {
return false;
}
return true;
}}
/>
<EuiPageContent>
<EuiFlexGrid columns={2} gutterSize="s">
{metricsToShow.map((metric, index) => (
Expand Down
Expand Up @@ -117,6 +117,7 @@ uiRoutes.when('/elasticsearch/nodes/:node/advanced', {
<AdvancedNode
nodeSummary={data.nodeSummary}
alerts={this.alerts}
nodeId={data.nodeSummary.resolver}
metrics={data.metrics}
onBrush={this.onBrush}
zoomInfo={this.zoomInfo}
Expand Down
7 changes: 2 additions & 5 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_alert.ts
Expand Up @@ -106,18 +106,15 @@ export class CpuUsageAlert extends BaseAlert {
this.config.ui.max_bucket_size
);
return stats.map((stat) => {
let cpuUsage = 0;
if (this.config.ui.container.elasticsearch.enabled) {
cpuUsage =
stat.cpuUsage =
(stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100;
} else {
cpuUsage = stat.cpuUsage;
}

return {
instanceKey: `${stat.clusterUuid}:${stat.nodeId}`,
clusterUuid: stat.clusterUuid,
shouldFire: cpuUsage > params.threshold,
shouldFire: stat.cpuUsage > params.threshold,
severity: AlertSeverity.Danger,
meta: stat,
ccs: stat.ccs,
Expand Down
Expand Up @@ -97,11 +97,18 @@ describe('fetchCpuUsageNodeStats', () => {
},
],
},
average_usage: {
value: 10,
},
average_periods: {
value: 5,
histo: {
buckets: [
null,
{
usage_deriv: {
normalized_value: 10,
},
periods_deriv: {
normalized_value: 5,
},
},
],
},
average_quota: {
value: 50,
Expand Down Expand Up @@ -185,38 +192,53 @@ describe('fetchCpuUsageNodeStats', () => {
});
await fetchCpuUsageNodeStats(callCluster, clusters, index, startMs, endMs, size);
expect(params).toStrictEqual({
index,
index: '.monitoring-es-*',
filterPath: ['aggregations'],
body: {
size: 0,
query: {
bool: {
filter: [
{ terms: { cluster_uuid: clusters.map((cluster) => cluster.clusterUuid) } },
{ terms: { cluster_uuid: ['abc123'] } },
{ term: { type: 'node_stats' } },
{ range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } },
],
},
},
aggs: {
clusters: {
terms: {
field: 'cluster_uuid',
size,
include: clusters.map((cluster) => cluster.clusterUuid),
},
terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] },
aggs: {
nodes: {
terms: { field: 'node_stats.node_id', size },
terms: { field: 'node_stats.node_id', size: 10 },
aggs: {
index: { terms: { field: '_index', size: 1 } },
average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } },
average_usage: { avg: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } },
average_periods: {
avg: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' },
},
average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } },
name: { terms: { field: 'source_node.name', size: 1 } },
histo: {
date_histogram: { field: 'timestamp', fixed_interval: '0m' },
aggs: {
average_periods: {
max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' },
},
average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } },
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip',
unit: '1s',
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip',
unit: '1s',
},
},
},
},
},
},
},
Expand Down
Expand Up @@ -4,6 +4,8 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { get } from 'lodash';
import moment from 'moment';
import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants';
import { AlertCluster, AlertCpuUsageNodeStats } from '../../alerts/types';

interface NodeBucketESResponse {
Expand All @@ -26,6 +28,9 @@ export async function fetchCpuUsageNodeStats(
endMs: number,
size: number
): Promise<AlertCpuUsageNodeStats[]> {
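// Expressing the interval in raw milliseconds did not work well with the
// date_histogram `fixed_interval`, so the time range is converted to minutes instead.
// NOTE(review): the value is not rounded, so a non-whole or zero-length range yields
// e.g. `1.5m` or `0m` — confirm Elasticsearch accepts these.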
const intervalInMinutes = moment.duration(endMs - startMs).asMinutes();
const filterPath = ['aggregations'];
const params = {
index,
Expand Down Expand Up @@ -82,16 +87,6 @@ export async function fetchCpuUsageNodeStats(
field: 'node_stats.process.cpu.percent',
},
},
average_usage: {
avg: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
average_periods: {
avg: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
average_quota: {
avg: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
Expand All @@ -103,6 +98,38 @@ export async function fetchCpuUsageNodeStats(
size: 1,
},
},
histo: {
date_histogram: {
field: 'timestamp',
fixed_interval: `${intervalInMinutes}m`,
},
aggs: {
average_periods: {
max: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
average_usage: {
max: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip',
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip',
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
},
},
},
},
},
Expand All @@ -120,17 +147,19 @@ export async function fetchCpuUsageNodeStats(
) as ClusterBucketESResponse[];
for (const clusterBucket of clusterBuckets) {
for (const node of clusterBucket.nodes.buckets) {
const lastBucket = get(node, 'histo.buckets[1]', {});
const indexName = get(node, 'index.buckets[0].key', '');
stats.push({
const stat = {
clusterUuid: clusterBucket.key,
nodeId: node.key,
nodeName: get(node, 'name.buckets[0].key'),
cpuUsage: get(node, 'average_cpu.value'),
containerUsage: get(node, 'average_usage.value'),
containerPeriods: get(node, 'average_periods.value'),
containerUsage: get(lastBucket, 'usage_deriv.normalized_value'),
containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'),
containerQuota: get(node, 'average_quota.value'),
ccs: indexName.includes(':') ? indexName.split(':')[0] : null,
});
};
stats.push(stat);
}
}
return stats;
Expand Down