Skip to content

Commit

Permalink
[Monitoring] Fix a couple of issues with the cpu usage alert (#80737)
Browse files Browse the repository at this point in the history
* Fix a couple of issues with the cpu usage alert

* Fix tests

* PR feedback
  • Loading branch information
chrisronline committed Oct 26, 2020
1 parent 582e8e9 commit d784840
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 38 deletions.
Expand Up @@ -18,8 +18,9 @@ import {
import { NodeDetailStatus } from '../node_detail_status';
import { MonitoringTimeseriesContainer } from '../../chart';
import { FormattedMessage } from '@kbn/i18n/react';
import { AlertsCallout } from '../../../alerts/callout';

export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => {
export const AdvancedNode = ({ nodeSummary, metrics, alerts, nodeId, ...props }) => {
const metricsToShow = [
metrics.node_gc,
metrics.node_gc_time,
Expand Down Expand Up @@ -50,9 +51,25 @@ export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => {
</h1>
</EuiScreenReaderOnly>
<EuiPanel>
<NodeDetailStatus stats={nodeSummary} alerts={alerts} />
<NodeDetailStatus
stats={nodeSummary}
alerts={alerts}
alertsStateFilter={(state) =>
state.nodeId === nodeId || state.stackProductUuid === nodeId
}
/>
</EuiPanel>
<EuiSpacer size="m" />
<AlertsCallout
alerts={alerts}
stateFilter={(state) => state.nodeId === nodeId || state.stackProductUuid === nodeId}
nextStepsFilter={(nextStep) => {
if (nextStep.text.includes('Elasticsearch nodes')) {
return false;
}
return true;
}}
/>
<EuiPageContent>
<EuiFlexGrid columns={2} gutterSize="s">
{metricsToShow.map((metric, index) => (
Expand Down
Expand Up @@ -117,6 +117,7 @@ uiRoutes.when('/elasticsearch/nodes/:node/advanced', {
<AdvancedNode
nodeSummary={data.nodeSummary}
alerts={this.alerts}
nodeId={data.nodeSummary.resolver}
metrics={data.metrics}
onBrush={this.onBrush}
zoomInfo={this.zoomInfo}
Expand Down
7 changes: 2 additions & 5 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_alert.ts
Expand Up @@ -106,18 +106,15 @@ export class CpuUsageAlert extends BaseAlert {
this.config.ui.max_bucket_size
);
return stats.map((stat) => {
let cpuUsage = 0;
if (this.config.ui.container.elasticsearch.enabled) {
cpuUsage =
stat.cpuUsage =
(stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100;
} else {
cpuUsage = stat.cpuUsage;
}

return {
instanceKey: `${stat.clusterUuid}:${stat.nodeId}`,
clusterUuid: stat.clusterUuid,
shouldFire: cpuUsage > params.threshold,
shouldFire: stat.cpuUsage > params.threshold,
severity: AlertSeverity.Danger,
meta: stat,
ccs: stat.ccs,
Expand Down
Expand Up @@ -97,11 +97,18 @@ describe('fetchCpuUsageNodeStats', () => {
},
],
},
average_usage: {
value: 10,
},
average_periods: {
value: 5,
histo: {
buckets: [
null,
{
usage_deriv: {
normalized_value: 10,
},
periods_deriv: {
normalized_value: 5,
},
},
],
},
average_quota: {
value: 50,
Expand Down Expand Up @@ -185,38 +192,53 @@ describe('fetchCpuUsageNodeStats', () => {
});
await fetchCpuUsageNodeStats(callCluster, clusters, index, startMs, endMs, size);
expect(params).toStrictEqual({
index,
index: '.monitoring-es-*',
filterPath: ['aggregations'],
body: {
size: 0,
query: {
bool: {
filter: [
{ terms: { cluster_uuid: clusters.map((cluster) => cluster.clusterUuid) } },
{ terms: { cluster_uuid: ['abc123'] } },
{ term: { type: 'node_stats' } },
{ range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } },
],
},
},
aggs: {
clusters: {
terms: {
field: 'cluster_uuid',
size,
include: clusters.map((cluster) => cluster.clusterUuid),
},
terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] },
aggs: {
nodes: {
terms: { field: 'node_stats.node_id', size },
terms: { field: 'node_stats.node_id', size: 10 },
aggs: {
index: { terms: { field: '_index', size: 1 } },
average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } },
average_usage: { avg: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } },
average_periods: {
avg: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' },
},
average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } },
name: { terms: { field: 'source_node.name', size: 1 } },
histo: {
date_histogram: { field: 'timestamp', fixed_interval: '0m' },
aggs: {
average_periods: {
max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' },
},
average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } },
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip',
unit: '1s',
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip',
unit: '1s',
},
},
},
},
},
},
},
Expand Down
Expand Up @@ -4,6 +4,8 @@
* you may not use this file except in compliance with the Elastic License.
*/
import { get } from 'lodash';
import moment from 'moment';
import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants';
import { AlertCluster, AlertCpuUsageNodeStats } from '../../alerts/types';

interface NodeBucketESResponse {
Expand All @@ -26,6 +28,9 @@ export async function fetchCpuUsageNodeStats(
endMs: number,
size: number
): Promise<AlertCpuUsageNodeStats[]> {
// Expressing the interval in raw milliseconds didn't seem to work well with the
// date_histogram interval, but expressing it in minutes does
const intervalInMinutes = moment.duration(endMs - startMs).asMinutes();
const filterPath = ['aggregations'];
const params = {
index,
Expand Down Expand Up @@ -82,16 +87,6 @@ export async function fetchCpuUsageNodeStats(
field: 'node_stats.process.cpu.percent',
},
},
average_usage: {
avg: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
average_periods: {
avg: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
average_quota: {
avg: {
field: 'node_stats.os.cgroup.cpu.cfs_quota_micros',
Expand All @@ -103,6 +98,38 @@ export async function fetchCpuUsageNodeStats(
size: 1,
},
},
histo: {
date_histogram: {
field: 'timestamp',
fixed_interval: `${intervalInMinutes}m`,
},
aggs: {
average_periods: {
max: {
field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods',
},
},
average_usage: {
max: {
field: 'node_stats.os.cgroup.cpuacct.usage_nanos',
},
},
usage_deriv: {
derivative: {
buckets_path: 'average_usage',
gap_policy: 'skip',
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
periods_deriv: {
derivative: {
buckets_path: 'average_periods',
gap_policy: 'skip',
unit: NORMALIZED_DERIVATIVE_UNIT,
},
},
},
},
},
},
},
Expand All @@ -120,17 +147,19 @@ export async function fetchCpuUsageNodeStats(
) as ClusterBucketESResponse[];
for (const clusterBucket of clusterBuckets) {
for (const node of clusterBucket.nodes.buckets) {
const lastBucket = get(node, 'histo.buckets[1]', {});
const indexName = get(node, 'index.buckets[0].key', '');
stats.push({
const stat = {
clusterUuid: clusterBucket.key,
nodeId: node.key,
nodeName: get(node, 'name.buckets[0].key'),
cpuUsage: get(node, 'average_cpu.value'),
containerUsage: get(node, 'average_usage.value'),
containerPeriods: get(node, 'average_periods.value'),
containerUsage: get(lastBucket, 'usage_deriv.normalized_value'),
containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'),
containerQuota: get(node, 'average_quota.value'),
ccs: indexName.includes(':') ? indexName.split(':')[0] : null,
});
};
stats.push(stat);
}
}
return stats;
Expand Down

0 comments on commit d784840

Please sign in to comment.