Skip to content

Commit

Permalink
[Alerting] Add telemetry for query/search durations during rule execution (#128299)
Browse files Browse the repository at this point in the history

* Updating types with new telemetry fields

* Updating functional tests

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
ymao1 and kibanamachine committed Mar 24, 2022
1 parent 2b4d721 commit 04691d7
Show file tree
Hide file tree
Showing 7 changed files with 393 additions and 11 deletions.
28 changes: 28 additions & 0 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,16 @@ Object {
'logs.alert.document.count': 1675765,
'document.test.': 17687687,
},
ruleTypesEsSearchDuration: {
'.index-threshold': 23,
'logs.alert.document.count': 526,
'document.test.': 534,
},
ruleTypesTotalSearchDuration: {
'.index-threshold': 62,
'logs.alert.document.count': 588,
'document.test.': 637,
},
},
},
failuresByReason: {
Expand All @@ -165,6 +175,12 @@ Object {
},
},
avgDuration: { value: 10 },
avgEsSearchDuration: {
value: 25.785714285714285,
},
avgTotalSearchDuration: {
value: 30.642857142857142,
},
},
hits: {
hits: [],
Expand All @@ -177,12 +193,24 @@ Object {
expect(mockEsClient.search).toHaveBeenCalledTimes(1);

expect(telemetry).toStrictEqual({
avgEsSearchDuration: 26,
avgEsSearchDurationByType: {
'__index-threshold': 12,
document__test__: 534,
logs__alert__document__count: 526,
},
avgExecutionTime: 0,
avgExecutionTimeByType: {
'__index-threshold': 1043934,
document__test__: 17687687,
logs__alert__document__count: 1675765,
},
avgTotalSearchDuration: 31,
avgTotalSearchDurationByType: {
'__index-threshold': 31,
document__test__: 637,
logs__alert__document__count: 588,
},
countByType: {
'__index-threshold': 2,
document__test__: 1,
Expand Down
59 changes: 57 additions & 2 deletions x-pack/plugins/alerting/server/usage/alerting_telemetry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,17 @@ const ruleTypeMetric = {

const ruleTypeExecutionsWithDurationMetric = {
scripted_metric: {
init_script: 'state.ruleTypes = [:]; state.ruleTypesDuration = [:];',
init_script:
'state.ruleTypes = [:]; state.ruleTypesDuration = [:]; state.ruleTypesEsSearchDuration = [:]; state.ruleTypesTotalSearchDuration = [:];',
map_script: `
String ruleType = doc['rule.category'].value;
long duration = doc['event.duration'].value / (1000 * 1000);
long esSearchDuration = doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.es_search_duration_ms'].value;
long totalSearchDuration = doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].empty ? 0 : doc['kibana.alert.rule.execution.metrics.total_search_duration_ms'].value;
state.ruleTypes.put(ruleType, state.ruleTypes.containsKey(ruleType) ? state.ruleTypes.get(ruleType) + 1 : 1);
state.ruleTypesDuration.put(ruleType, state.ruleTypesDuration.containsKey(ruleType) ? state.ruleTypesDuration.get(ruleType) + duration : duration);
state.ruleTypesEsSearchDuration.put(ruleType, state.ruleTypesEsSearchDuration.containsKey(ruleType) ? state.ruleTypesEsSearchDuration.get(ruleType) + esSearchDuration : esSearchDuration);
state.ruleTypesTotalSearchDuration.put(ruleType, state.ruleTypesTotalSearchDuration.containsKey(ruleType) ? state.ruleTypesTotalSearchDuration.get(ruleType) + totalSearchDuration : totalSearchDuration);
`,
// Combine script is executed per cluster, but we already have a key-value pair per cluster.
// Despite docs that say this is optional, this script can't be blank.
Expand Down Expand Up @@ -398,13 +403,24 @@ export async function getExecutionsPerDayCount(
byRuleTypeId: ruleTypeExecutionsWithDurationMetric,
failuresByReason: ruleTypeFailureExecutionsMetric,
avgDuration: { avg: { field: 'event.duration' } },
avgEsSearchDuration: {
avg: { field: 'kibana.alert.rule.execution.metrics.es_search_duration_ms' },
},
avgTotalSearchDuration: {
avg: { field: 'kibana.alert.rule.execution.metrics.total_search_duration_ms' },
},
},
},
});

const executionsAggregations = searchResult.aggregations as {
byRuleTypeId: {
value: { ruleTypes: Record<string, string>; ruleTypesDuration: Record<string, number> };
value: {
ruleTypes: Record<string, string>;
ruleTypesDuration: Record<string, number>;
ruleTypesEsSearchDuration: Record<string, number>;
ruleTypesTotalSearchDuration: Record<string, number>;
};
};
};

Expand All @@ -414,6 +430,15 @@ export async function getExecutionsPerDayCount(
searchResult.aggregations.avgDuration.value / (1000 * 1000)
);

const aggsAvgEsSearchDuration = Math.round(
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.avgEsSearchDuration.value
);
const aggsAvgTotalSearchDuration = Math.round(
// @ts-expect-error aggregation type is not specified
searchResult.aggregations.avgTotalSearchDuration.value
);

const executionFailuresAggregations = searchResult.aggregations as {
failuresByReason: { value: { reasons: Record<string, Record<string, string>> } };
};
Expand Down Expand Up @@ -482,6 +507,36 @@ export async function getExecutionsPerDayCount(
}),
{}
),
avgEsSearchDuration: aggsAvgEsSearchDuration,
avgEsSearchDurationByType: Object.keys(
executionsAggregations.byRuleTypeId.value.ruleTypes
).reduce(
// ES DSL aggregations are returned as `any` by esClient.search
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(obj: any, key: string) => ({
...obj,
[replaceDotSymbols(key)]: Math.round(
executionsAggregations.byRuleTypeId.value.ruleTypesEsSearchDuration[key] /
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
),
}),
{}
),
avgTotalSearchDuration: aggsAvgTotalSearchDuration,
avgTotalSearchDurationByType: Object.keys(
executionsAggregations.byRuleTypeId.value.ruleTypes
).reduce(
// ES DSL aggregations are returned as `any` by esClient.search
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(obj: any, key: string) => ({
...obj,
[replaceDotSymbols(key)]: Math.round(
executionsAggregations.byRuleTypeId.value.ruleTypesTotalSearchDuration[key] /
parseInt(executionsAggregations.byRuleTypeId.value.ruleTypes[key], 10)
),
}),
{}
),
};
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,10 @@ export function createAlertingUsageCollector(
count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day: {},
avg_execution_time_per_day: 0,
avg_execution_time_by_type_per_day: {},
avg_es_search_duration_per_day: 0,
avg_es_search_duration_by_type_per_day: {},
avg_total_search_duration_per_day: 0,
avg_total_search_duration_by_type_per_day: {},
};
}
},
Expand Down Expand Up @@ -203,6 +207,10 @@ export function createAlertingUsageCollector(
count_failed_and_unrecognized_rule_tasks_by_status_by_type_per_day: byTaskStatusSchemaByType,
avg_execution_time_per_day: { type: 'long' },
avg_execution_time_by_type_per_day: byTypeSchema,
avg_es_search_duration_per_day: { type: 'long' },
avg_es_search_duration_by_type_per_day: byTypeSchema,
avg_total_search_duration_per_day: { type: 'long' },
avg_total_search_duration_by_type_per_day: byTypeSchema,
},
});
}
Expand Down
6 changes: 6 additions & 0 deletions x-pack/plugins/alerting/server/usage/task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,12 @@ export function telemetryTaskRunner(
dailyFailedAndUnrecognizedTasks.countByStatusByRuleType,
avg_execution_time_per_day: dailyExecutionCounts.avgExecutionTime,
avg_execution_time_by_type_per_day: dailyExecutionCounts.avgExecutionTimeByType,
avg_es_search_duration_per_day: dailyExecutionCounts.avgEsSearchDuration,
avg_es_search_duration_by_type_per_day:
dailyExecutionCounts.avgEsSearchDurationByType,
avg_total_search_duration_per_day: dailyExecutionCounts.avgTotalSearchDuration,
avg_total_search_duration_by_type_per_day:
dailyExecutionCounts.avgTotalSearchDurationByType,
},
runAt: getNextMidnight(),
};
Expand Down
4 changes: 4 additions & 0 deletions x-pack/plugins/alerting/server/usage/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ export interface AlertingUsage {
>;
avg_execution_time_per_day: number;
avg_execution_time_by_type_per_day: Record<string, number>;
avg_es_search_duration_per_day: number;
avg_es_search_duration_by_type_per_day: Record<string, number>;
avg_total_search_duration_per_day: number;
avg_total_search_duration_by_type_per_day: Record<string, number>;
throttle_time: {
min: string;
avg: string;
Expand Down

0 comments on commit 04691d7

Please sign in to comment.