diff --git a/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts b/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts index 4e34fc2a04f302..3bb64ad00a194a 100644 --- a/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts +++ b/x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts @@ -151,6 +151,16 @@ Object { 'logs.alert.document.count': 1675765, 'document.test.': 17687687, }, + ruleTypesEsSearchDuration: { + '.index-threshold': 23, + 'logs.alert.document.count': 526, + 'document.test.': 534, + }, + ruleTypesTotalSearchDuration: { + '.index-threshold': 62, + 'logs.alert.document.count': 588, + 'document.test.': 637, + }, }, }, failuresByReason: { @@ -165,6 +175,12 @@ Object { }, }, avgDuration: { value: 10 }, + avgEsSearchDuration: { + value: 25.785714285714285, + }, + avgTotalSearchDuration: { + value: 30.642857142857142, + }, }, hits: { hits: [], @@ -177,12 +193,24 @@ Object { expect(mockEsClient.search).toHaveBeenCalledTimes(1); expect(telemetry).toStrictEqual({ + avgEsSearchDuration: 26, + avgEsSearchDurationByType: { + '__index-threshold': 12, + document__test__: 534, + logs__alert__document__count: 526, + }, avgExecutionTime: 0, avgExecutionTimeByType: { '__index-threshold': 1043934, document__test__: 17687687, logs__alert__document__count: 1675765, }, + avgTotalSearchDuration: 31, + avgTotalSearchDurationByType: { + '__index-threshold': 31, + document__test__: 637, + logs__alert__document__count: 588, + }, countByType: { '__index-threshold': 2, document__test__: 1, diff --git a/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts b/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts index b77083c62f0002..4fbad593d1600a 100644 --- a/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts +++ b/x-pack/plugins/alerting/server/usage/alerting_telemetry.ts @@ -40,12 +40,17 @@ const ruleTypeMetric = { const ruleTypeExecutionsWithDurationMetric = { scripted_metric: { - init_script: 'state.ruleTypes = [:]; 
state.ruleTypesDuration = [:];', + init_script: + 'state.ruleTypes = [:]; state.ruleTypesDuration = [:]; state.ruleTypesEsSearchDuration = [:]; state.ruleTypesTotalSearchDuration = [:];', map_script: ` String ruleType = doc['rule.category'].value; long duration = doc['event.duration'].value / (1000 * 1000); + long esSearchDuration = doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].value; + long totalSearchDuration = doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].value; state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1); state.ruleTypesDuration.put(ruleType, state.ruleTypesDuration.containsKey(ruleType) ? state.ruleTypesDuration.get(ruleType) + duration : duration); + state.ruleTypesEsSearchDuration.put(ruleType, state.ruleTypesEsSearchDuration.containsKey(ruleType) ? state.ruleTypesEsSearchDuration.get(ruleType) + esSearchDuration : esSearchDuration); + state.ruleTypesTotalSearchDuration.put(ruleType, state.ruleTypesTotalSearchDuration.containsKey(ruleType) ? state.ruleTypesTotalSearchDuration.get(ruleType) + totalSearchDuration : totalSearchDuration); `, // Combine script is executed per cluster, but we already have a key-value pair per cluster. // Despite docs that say this is optional, this script can't be blank. 
@@ -398,13 +403,24 @@ export async function getExecutionsPerDayCount( byRuleTypeId: ruleTypeExecutionsWithDurationMetric, failuresByReason: ruleTypeFailureExecutionsMetric, avgDuration: { avg: { field: 'event.duration' } }, + avgEsSearchDuration: { + avg: { field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms' }, + }, + avgTotalSearchDuration: { + avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' }, + }, }, }, }); const executionsAggregations = searchResult.aggregations as { byRuleTypeId: { - value: { ruleTypes: Record; ruleTypesDuration: Record }; + value: { + ruleTypes: Record; + ruleTypesDuration: Record; + ruleTypesEsSearchDuration: Record; + ruleTypesTotalSearchDuration: Record; + }; }; }; @@ -414,6 +430,15 @@ export async function getExecutionsPerDayCount( searchResult.aggregations.avgDuration.value / (1000 * 1000) ); + const aggsAvgEsSearchDuration = Math.round( + // @ts-expect-error aggregation type is not specified + searchResult.aggregations.avgEsSearchDuration.value + ); + const aggsAvgTotalSearchDuration = Math.round( + // @ts-expect-error aggregation type is not specified + searchResult.aggregations.avgTotalSearchDuration.value + ); + const executionFailuresAggregations = searchResult.aggregations as { failuresByReason: { value: { reasons: Record> } }; }; @@ -482,6 +507,36 @@ export async function getExecutionsPerDayCount( }), {} ), + avgEsSearchDuration: aggsAvgEsSearchDuration, + avgEsSearchDurationByType: Object.keys( + executionsAggregations.byRuleTypeId.value.ruleTypes + ).reduce( + // ES DSL aggregations are returned as `any` by esClient.search + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (obj: any, key: string) => ({ + ...obj, + [replaceDotSymbols(key)]: Math.round( + executionsAggregations.byRuleTypeId.value.ruleTypesEsSearchDuration[key] / + parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + ), + }), + {} + ), + avgTotalSearchDuration: aggsAvgTotalSearchDuration, 
+ avgTotalSearchDurationByType: Object.keys( + executionsAggregations.byRuleTypeId.value.ruleTypes + ).reduce( + // ES DSL aggregations are returned as `any` by esClient.search + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (obj: any, key: string) => ({ + ...obj, + [replaceDotSymbols(key)]: Math.round( + executionsAggregations.byRuleTypeId.value.ruleTypesTotalSearchDuration[key] / + parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10) + ), + }), + {} + ), }; } diff --git a/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts b/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts index 54e45497863814..f375e758a8c9b2 100644 --- a/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts +++ b/x-pack/plugins/alerting/server/usage/alerting_usage_collector.ts @@ -156,6 +156,10 @@ export function createAlertingUsageCollector( count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day: {}, avg_execution_time_per_day: 0, avg_execution_time_by_type_per_day: {}, + avg_es_search_duration_per_day: 0, + avg_es_search_duration_by_type_per_day: {}, + avg_total_search_duration_per_day: 0, + avg_total_search_duration_by_type_per_day: {}, }; } }, @@ -203,6 +207,10 @@ export function createAlertingUsageCollector( count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day: byTaskStatusSchemaByType, avg_execution_time_per_day: { type: 'long' }, avg_execution_time_by_type_per_day: byTypeSchema, + avg_es_search_duration_per_day: { type: 'long' }, + avg_es_search_duration_by_type_per_day: byTypeSchema, + avg_total_search_duration_per_day: { type: 'long' }, + avg_total_search_duration_by_type_per_day: byTypeSchema, }, }); } diff --git a/x-pack/plugins/alerting/server/usage/task.ts b/x-pack/plugins/alerting/server/usage/task.ts index 15978e9967ad21..7aee0436538060 100644 --- a/x-pack/plugins/alerting/server/usage/task.ts +++ b/x-pack/plugins/alerting/server/usage/task.ts @@ -138,6 +138,12 @@ export function 
telemetryTaskRunner( dailyFailedAndUnrecognizedTasks.countByStatusByRuleType, avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime, avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType, + avg_es_search_duration_per_day: dailyExecutionCounts.avgEsSearchDuration, + avg_es_search_duration_by_type_per_day: + dailyExecutionCounts.avgEsSearchDurationByType, + avg_total_search_duration_per_day: dailyExecutionCounts.avgTotalSearchDuration, + avg_total_search_duration_by_type_per_day: + dailyExecutionCounts.avgTotalSearchDurationByType, }, runAt: getNextMidnight(), }; diff --git a/x-pack/plugins/alerting/server/usage/types.ts b/x-pack/plugins/alerting/server/usage/types.ts index ae951f5d659424..a03483bd54007d 100644 --- a/x-pack/plugins/alerting/server/usage/types.ts +++ b/x-pack/plugins/alerting/server/usage/types.ts @@ -27,6 +27,10 @@ export interface AlertingUsage { >; avg_execution_time_per_day: number; avg_execution_time_by_type_per_day: Record; + avg_es_search_duration_per_day: number; + avg_es_search_duration_by_type_per_day: Record; + avg_total_search_duration_per_day: number; + avg_total_search_duration_by_type_per_day: Record; throttle_time: { min: string; avg: string; diff --git a/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json b/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json index f8230be2f5908b..244599b3fc5e4f 100644 --- a/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json +++ b/x-pack/plugins/telemetry_collection_xpack/schema/xpack_plugins.json @@ -1731,6 +1731,218 @@ "type": "long" } } + }, + "avg_es_search_duration_per_day": { + "type": "long" + }, + "avg_es_search_duration_by_type_per_day": { + "properties": { + "DYNAMIC_KEY": { + "type": "long" + }, + "__index-threshold": { + "type": "long" + }, + "__es-query": { + "type": "long" + }, + "transform_health": { + "type": "long" + }, + "apm__error_rate": { + "type": "long" + }, + "apm__transaction_error_rate": { + 
"type": "long" + }, + "apm__transaction_duration": { + "type": "long" + }, + "apm__transaction_duration_anomaly": { + "type": "long" + }, + "metrics__alert__threshold": { + "type": "long" + }, + "metrics__alert__inventory__threshold": { + "type": "long" + }, + "logs__alert__document__count": { + "type": "long" + }, + "monitoring_alert_cluster_health": { + "type": "long" + }, + "monitoring_alert_cpu_usage": { + "type": "long" + }, + "monitoring_alert_disk_usage": { + "type": "long" + }, + "monitoring_alert_elasticsearch_version_mismatch": { + "type": "long" + }, + "monitoring_alert_kibana_version_mismatch": { + "type": "long" + }, + "monitoring_alert_license_expiration": { + "type": "long" + }, + "monitoring_alert_logstash_version_mismatch": { + "type": "long" + }, + "monitoring_alert_nodes_changed": { + "type": "long" + }, + "siem__signals": { + "type": "long" + }, + "siem__notifications": { + "type": "long" + }, + "siem__eqlRule": { + "type": "long" + }, + "siem__indicatorRule": { + "type": "long" + }, + "siem__mlRule": { + "type": "long" + }, + "siem__queryRule": { + "type": "long" + }, + "siem__savedQueryRule": { + "type": "long" + }, + "siem__thresholdRule": { + "type": "long" + }, + "xpack__uptime__alerts__monitorStatus": { + "type": "long" + }, + "xpack__uptime__alerts__tls": { + "type": "long" + }, + "xpack__uptime__alerts__durationAnomaly": { + "type": "long" + }, + "__geo-containment": { + "type": "long" + }, + "xpack__ml__anomaly_detection_alert": { + "type": "long" + }, + "xpack__ml__anomaly_detection_jobs_health": { + "type": "long" + } + } + }, + "avg_total_search_duration_per_day": { + "type": "long" + }, + "avg_total_search_duration_by_type_per_day": { + "properties": { + "DYNAMIC_KEY": { + "type": "long" + }, + "__index-threshold": { + "type": "long" + }, + "__es-query": { + "type": "long" + }, + "transform_health": { + "type": "long" + }, + "apm__error_rate": { + "type": "long" + }, + "apm__transaction_error_rate": { + "type": "long" + }, + 
"apm__transaction_duration": { + "type": "long" + }, + "apm__transaction_duration_anomaly": { + "type": "long" + }, + "metrics__alert__threshold": { + "type": "long" + }, + "metrics__alert__inventory__threshold": { + "type": "long" + }, + "logs__alert__document__count": { + "type": "long" + }, + "monitoring_alert_cluster_health": { + "type": "long" + }, + "monitoring_alert_cpu_usage": { + "type": "long" + }, + "monitoring_alert_disk_usage": { + "type": "long" + }, + "monitoring_alert_elasticsearch_version_mismatch": { + "type": "long" + }, + "monitoring_alert_kibana_version_mismatch": { + "type": "long" + }, + "monitoring_alert_license_expiration": { + "type": "long" + }, + "monitoring_alert_logstash_version_mismatch": { + "type": "long" + }, + "monitoring_alert_nodes_changed": { + "type": "long" + }, + "siem__signals": { + "type": "long" + }, + "siem__notifications": { + "type": "long" + }, + "siem__eqlRule": { + "type": "long" + }, + "siem__indicatorRule": { + "type": "long" + }, + "siem__mlRule": { + "type": "long" + }, + "siem__queryRule": { + "type": "long" + }, + "siem__savedQueryRule": { + "type": "long" + }, + "siem__thresholdRule": { + "type": "long" + }, + "xpack__uptime__alerts__monitorStatus": { + "type": "long" + }, + "xpack__uptime__alerts__tls": { + "type": "long" + }, + "xpack__uptime__alerts__durationAnomaly": { + "type": "long" + }, + "__geo-containment": { + "type": "long" + }, + "xpack__ml__anomaly_detection_alert": { + "type": "long" + }, + "xpack__ml__anomaly_detection_jobs_health": { + "type": "long" + } + } } } }, diff --git a/x-pack/test/alerting_api_integration/security_and_spaces/tests/telemetry/alerting_telemetry.ts b/x-pack/test/alerting_api_integration/security_and_spaces/tests/telemetry/alerting_telemetry.ts index 9b8a96bc056cef..3b768b563b999e 100644 --- a/x-pack/test/alerting_api_integration/security_and_spaces/tests/telemetry/alerting_telemetry.ts +++ 
b/x-pack/test/alerting_api_integration/security_and_spaces/tests/telemetry/alerting_telemetry.ts @@ -13,6 +13,7 @@ import { getTestRuleData, ObjectRemover, TaskManagerDoc, + ESTestIndexTool, } from '../../../common/lib'; import { FtrProviderContext } from '../../../common/ftr_provider_context'; @@ -22,6 +23,7 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider const es = getService('es'); const retry = getService('retry'); const supertestWithoutAuth = getService('supertestWithoutAuth'); + const esTestIndexTool = new ESTestIndexTool(es, retry); describe('alerting telemetry', () => { const alwaysFiringRuleId: { [key: string]: string } = {}; @@ -43,6 +45,11 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider }); after(() => objectRemover.removeAll()); + beforeEach(async () => { + await esTestIndexTool.destroy(); + await esTestIndexTool.setup(); + }); + async function createConnector(opts: { name: string; space: string; connectorTypeId: string }) { const { name, space, connectorTypeId } = opts; const { body: createdConnector } = await supertestWithoutAuth @@ -178,6 +185,28 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider ], }, }); + + await createRule({ + space: space.id, + ruleOverwrites: { + rule_type_id: 'test.multipleSearches', + schedule: { interval: '29s' }, + throttle: '1m', + params: { numSearches: 2, delay: `2s` }, + actions: [ + { + id: noopConnectorId, + group: 'default', + params: {}, + }, + { + id: noopConnectorId, + group: 'default', + params: {}, + }, + ], + }, + }); } } @@ -192,7 +221,7 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider type: 'alert', id: alwaysFiringRuleId[Spaces[0].id], provider: 'alerting', - actions: new Map([['execute', { gte: 5 }]]), + actions: new Map([['execute', { gte: 8 }]]), }); }); @@ -213,10 +242,10 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider const 
telemetry = JSON.parse(taskState!); // total number of rules - expect(telemetry.count_total).to.equal(15); + expect(telemetry.count_total).to.equal(18); // total number of enabled rules - expect(telemetry.count_active_total).to.equal(12); + expect(telemetry.count_active_total).to.equal(15); // total number of disabled rules expect(telemetry.count_disabled_total).to.equal(3); @@ -226,32 +255,34 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider expect(telemetry.count_by_type['example__always-firing']).to.equal(3); expect(telemetry.count_by_type.test__throw).to.equal(3); expect(telemetry.count_by_type.test__noop).to.equal(6); + expect(telemetry.count_by_type.test__multipleSearches).to.equal(3); // total number of enabled rules broken down by rule type expect(telemetry.count_active_by_type.test__onlyContextVariables).to.equal(3); expect(telemetry.count_active_by_type['example__always-firing']).to.equal(3); expect(telemetry.count_active_by_type.test__throw).to.equal(3); expect(telemetry.count_active_by_type.test__noop).to.equal(3); + expect(telemetry.count_active_by_type.test__multipleSearches).to.equal(3); // throttle time stats expect(telemetry.throttle_time.min).to.equal('0s'); - expect(telemetry.throttle_time.avg).to.equal('157.75s'); + expect(telemetry.throttle_time.avg).to.equal('138.2s'); expect(telemetry.throttle_time.max).to.equal('600s'); expect(telemetry.throttle_time_number_s.min).to.equal(0); - expect(telemetry.throttle_time_number_s.avg).to.equal(157.75); + expect(telemetry.throttle_time_number_s.avg).to.equal(138.2); expect(telemetry.throttle_time_number_s.max).to.equal(600); // schedule interval stats expect(telemetry.schedule_time.min).to.equal('3s'); - expect(telemetry.schedule_time.avg).to.equal('80.6s'); + expect(telemetry.schedule_time.avg).to.equal('72s'); expect(telemetry.schedule_time.max).to.equal('300s'); expect(telemetry.schedule_time_number_s.min).to.equal(3); - 
expect(telemetry.schedule_time_number_s.avg).to.equal(80.6); + expect(telemetry.schedule_time_number_s.avg).to.equal(72); expect(telemetry.schedule_time_number_s.max).to.equal(300); // attached connectors stats expect(telemetry.connectors_per_alert.min).to.equal(1); - expect(telemetry.connectors_per_alert.avg).to.equal(1.4); + expect(telemetry.connectors_per_alert.avg).to.equal(1.5); expect(telemetry.connectors_per_alert.max).to.equal(3); // number of spaces with rules @@ -259,13 +290,14 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider // number of rule executions - just checking for non-zero as we can't set an exact number // each rule should have had a chance to execute once - expect(telemetry.count_rules_executions_per_day >= 15).to.be(true); + expect(telemetry.count_rules_executions_per_day >= 18).to.be(true); // number of rule executions broken down by rule type expect(telemetry.count_by_type.test__onlyContextVariables >= 3).to.be(true); expect(telemetry.count_by_type['example__always-firing'] >= 3).to.be(true); expect(telemetry.count_by_type.test__throw >= 3).to.be(true); expect(telemetry.count_by_type.test__noop >= 3).to.be(true); + expect(telemetry.count_by_type.test__multipleSearches >= 3).to.be(true); // average execution time - just checking for non-zero as we can't set an exact number expect(telemetry.avg_execution_time_per_day > 0).to.be(true); @@ -279,6 +311,43 @@ export default function createAlertingTelemetryTests({ getService }: FtrProvider ); expect(telemetry.avg_execution_time_by_type_per_day.test__throw > 0).to.be(true); expect(telemetry.avg_execution_time_by_type_per_day.test__noop > 0).to.be(true); + expect(telemetry.avg_execution_time_by_type_per_day.test__multipleSearches > 0).to.be(true); + + // average es search time - just checking for non-zero as we can't set an exact number + expect(telemetry.avg_es_search_duration_per_day > 0).to.be(true); + + // average es search time broken down by rule type, most 
of these rule types don't perform ES queries + expect( + telemetry.avg_es_search_duration_by_type_per_day.test__onlyContextVariables === 0 + ).to.be(true); + expect( + telemetry.avg_es_search_duration_by_type_per_day['example__always-firing'] === 0 + ).to.be(true); + expect(telemetry.avg_es_search_duration_by_type_per_day.test__throw === 0).to.be(true); + expect(telemetry.avg_es_search_duration_by_type_per_day.test__noop === 0).to.be(true); + + // rule type that performs ES search + expect(telemetry.avg_es_search_duration_by_type_per_day.test__multipleSearches > 0).to.be( + true + ); + + // average total search time - just checking for non-zero as we can't set an exact number + expect(telemetry.avg_total_search_duration_per_day > 0).to.be(true); + + // average total search time broken down by rule type, most of these rule types don't perform ES queries + expect( + telemetry.avg_total_search_duration_by_type_per_day.test__onlyContextVariables === 0 + ).to.be(true); + expect( + telemetry.avg_total_search_duration_by_type_per_day['example__always-firing'] === 0 + ).to.be(true); + expect(telemetry.avg_total_search_duration_by_type_per_day.test__throw === 0).to.be(true); + expect(telemetry.avg_total_search_duration_by_type_per_day.test__noop === 0).to.be(true); + + // rule type that performs ES search + expect(telemetry.avg_total_search_duration_by_type_per_day.test__multipleSearches > 0).to.be( + true + ); // number of failed executions - we have one rule that always fails expect(telemetry.count_rules_executions_failured_per_day >= 1).to.be(true);