Skip to content

Commit

Permalink
[ML] Check for error messages in the Anomaly Detection jobs health ru…
Browse files Browse the repository at this point in the history
…le type (#108701)

* [ML] retrieve job errors

* [ML] account for previous execution time

* [ML] update default message

* [ML] update description

* [ML] update unit tests

* [ML] update unit tests

* [ML] update action name

* [ML] update errorMessages name

* [ML] update a default message to avoid line breaks

* [ML] update rule helper text

* [ML] refactor getJobsErrors

* [ML] perform errors check starting from the second execution
  • Loading branch information
darnautov authored and kibanamachine committed Aug 17, 2021
1 parent 042795e commit 461a8cd
Show file tree
Hide file tree
Showing 8 changed files with 268 additions and 86 deletions.
4 changes: 2 additions & 2 deletions x-pack/plugins/ml/common/constants/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@ export const HEALTH_CHECK_NAMES: Record<JobsHealthTests, { name: string; descrip
},
errorMessages: {
name: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesCheckName', {
defaultMessage: 'There are errors in the job messages',
defaultMessage: 'Errors in job messages',
}),
description: i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesCheckDescription',
{
defaultMessage: 'There are errors in the job messages',
defaultMessage: 'Get alerted if a job contains errors in the job messages.',
}
),
},
Expand Down
6 changes: 6 additions & 0 deletions x-pack/plugins/ml/common/util/alerts.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ describe('getResultJobsHealthRuleConfig', () => {
enabled: true,
timeInterval: null,
},
errorMessages: {
enabled: true,
},
});
});
test('returns config with overridden values based on provided configuration', () => {
Expand All @@ -119,6 +122,9 @@ describe('getResultJobsHealthRuleConfig', () => {
enabled: true,
timeInterval: null,
},
errorMessages: {
enabled: true,
},
});
});
});
2 changes: 1 addition & 1 deletion x-pack/plugins/ml/common/util/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ export function getTopNBuckets(job: Job): number {
return Math.ceil(narrowBucketLength / bucketSpan.asSeconds());
}

const implementedTests = ['datafeed', 'mml', 'delayedData'] as JobsHealthTests[];
const implementedTests = ['datafeed', 'mml', 'delayedData', 'errorMessages'] as JobsHealthTests[];

/**
* Returns tests configuration combined with default values.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ export function registerJobsHealthAlertingRule(
triggersActionsUi.ruleTypeRegistry.register({
id: ML_ALERT_TYPES.AD_JOBS_HEALTH,
description: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.description', {
defaultMessage: 'Alert when anomaly detection jobs experience operational issues.',
defaultMessage:
'Alert when anomaly detection jobs experience operational issues. Enable suitable alerts for critically important jobs.',
}),
iconClass: 'bell',
documentationUrl(docLinks) {
Expand Down Expand Up @@ -90,14 +91,15 @@ export function registerJobsHealthAlertingRule(
\\{\\{context.message\\}\\}
\\{\\{#context.results\\}\\}
Job ID: \\{\\{job_id\\}\\}
\\{\\{#datafeed_id\\}\\}Datafeed ID: \\{\\{datafeed_id\\}\\} \\{\\{/datafeed_id\\}\\}
\\{\\{#datafeed_state\\}\\}Datafeed state: \\{\\{datafeed_state\\}\\} \\{\\{/datafeed_state\\}\\}
\\{\\{#memory_status\\}\\}Memory status: \\{\\{memory_status\\}\\} \\{\\{/memory_status\\}\\}
\\{\\{#log_time\\}\\}Memory logging time: \\{\\{log_time\\}\\} \\{\\{/log_time\\}\\}
\\{\\{#failed_category_count\\}\\}Failed category count: \\{\\{failed_category_count\\}\\} \\{\\{/failed_category_count\\}\\}
\\{\\{#annotation\\}\\}Annotation: \\{\\{annotation\\}\\} \\{\\{/annotation\\}\\}
\\{\\{#missed_docs_count\\}\\}Number of missed documents: \\{\\{missed_docs_count\\}\\} \\{\\{/missed_docs_count\\}\\}
\\{\\{#end_timestamp\\}\\}Latest finalized bucket with missing docs: \\{\\{end_timestamp\\}\\} \\{\\{/end_timestamp\\}\\}
\\{\\{#datafeed_id\\}\\}Datafeed ID: \\{\\{datafeed_id\\}\\}
\\{\\{/datafeed_id\\}\\} \\{\\{#datafeed_state\\}\\}Datafeed state: \\{\\{datafeed_state\\}\\}
\\{\\{/datafeed_state\\}\\} \\{\\{#memory_status\\}\\}Memory status: \\{\\{memory_status\\}\\}
\\{\\{/memory_status\\}\\} \\{\\{#log_time\\}\\}Memory logging time: \\{\\{log_time\\}\\}
\\{\\{/log_time\\}\\} \\{\\{#failed_category_count\\}\\}Failed category count: \\{\\{failed_category_count\\}\\}
\\{\\{/failed_category_count\\}\\} \\{\\{#annotation\\}\\}Annotation: \\{\\{annotation\\}\\}
\\{\\{/annotation\\}\\} \\{\\{#missed_docs_count\\}\\}Number of missed documents: \\{\\{missed_docs_count\\}\\}
\\{\\{/missed_docs_count\\}\\} \\{\\{#end_timestamp\\}\\}Latest finalized bucket with missing docs: \\{\\{end_timestamp\\}\\}
\\{\\{/end_timestamp\\}\\} \\{\\{#errors\\}\\}Error message: \\{\\{message\\}\\} \\{\\{/errors\\}\\}
\\{\\{/context.results\\}\\}
`,
}
Expand Down
177 changes: 117 additions & 60 deletions x-pack/plugins/ml/server/lib/alerts/jobs_health_service.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,39 @@ import type { Logger } from 'kibana/server';
import { MlClient } from '../ml_client';
import { MlJob, MlJobStats } from '@elastic/elasticsearch/api/types';
import { AnnotationService } from '../../models/annotation_service/annotation';
import { JobsHealthExecutorOptions } from './register_jobs_monitoring_rule_type';
import { JobAuditMessagesService } from '../../models/job_audit_messages/job_audit_messages';
import { DeepPartial } from '../../../common/types/common';

// Fixed epoch timestamp in milliseconds (2017-02-14T12:51:48.000Z).
// NOTE(review): presumably used to stub the current time in these tests — its usage is not visible in this hunk.
const MOCK_DATE_NOW = 1487076708000;

/**
 * Builds the executor options object passed to the jobs health service in tests.
 *
 * Top-level keys of `overrides` replace the corresponding defaults. The nested
 * `rule` object is merged key-by-key with the default rule, so a partial override
 * such as `{ rule: { name: 'testRule' } }` keeps the remaining default rule fields
 * (tags, consumer, producer, schedule, ...) instead of discarding them.
 *
 * @param overrides - partial executor options applied on top of the defaults
 * @returns the combined options, cast to `JobsHealthExecutorOptions`; only the
 *   fields listed below plus the provided overrides are populated
 */
function getDefaultExecutorOptions(
  overrides: DeepPartial<JobsHealthExecutorOptions> = {}
): JobsHealthExecutorOptions {
  // Split the nested rule overrides out so they can be merged rather than
  // replacing the whole default rule via the shallow spread.
  const { rule: ruleOverrides, ...topLevelOverrides } = overrides;

  return ({
    state: {},
    startedAt: new Date('2021-08-12T13:13:39.396Z'),
    previousStartedAt: new Date('2021-08-12T13:13:27.396Z'),
    spaceId: 'default',
    namespace: undefined,
    name: 'ml-health-check',
    tags: [],
    createdBy: 'elastic',
    updatedBy: 'elastic',
    rule: {
      name: 'ml-health-check',
      tags: [],
      consumer: 'alerts',
      producer: 'ml',
      ruleTypeId: 'xpack.ml.anomaly_detection_jobs_health',
      ruleTypeName: 'Anomaly detection jobs health',
      enabled: true,
      schedule: { interval: '10s' },
      ...ruleOverrides,
    },
    ...topLevelOverrides,
  } as unknown) as JobsHealthExecutorOptions;
}

describe('JobsHealthService', () => {
const mlClient = ({
getJobs: jest.fn().mockImplementation(({ job_id: jobIds = [] }) => {
Expand Down Expand Up @@ -117,6 +147,12 @@ describe('JobsHealthService', () => {
}),
} as unknown) as jest.Mocked<AnnotationService>;

// Stub of the job audit messages service: `getJobsErrors` always resolves
// with an empty object, whatever job IDs it is called with.
const jobAuditMessagesService = ({
  getJobsErrors: jest.fn(async (jobIds: string) => ({})),
} as unknown) as jest.Mocked<JobAuditMessagesService>;

const logger = ({
warn: jest.fn(),
info: jest.fn(),
Expand All @@ -127,6 +163,7 @@ describe('JobsHealthService', () => {
mlClient,
datafeedsService,
annotationService,
jobAuditMessagesService,
logger
);

Expand All @@ -143,70 +180,85 @@ describe('JobsHealthService', () => {

test('returns empty results when no jobs provided', async () => {
// act
const executionResult = await jobHealthService.getTestsResults('testRule', {
testsConfig: null,
includeJobs: {
jobIds: ['*'],
groupIds: [],
},
excludeJobs: null,
});
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule' },
params: {
testsConfig: null,
includeJobs: {
jobIds: ['*'],
groupIds: [],
},
excludeJobs: null,
},
})
);
expect(logger.warn).toHaveBeenCalledWith('Rule "testRule" does not have associated jobs.');
expect(datafeedsService.getDatafeedByJobId).not.toHaveBeenCalled();
expect(executionResult).toEqual([]);
});

test('returns empty results and does not perform datafeed check when test is disabled', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule', {
testsConfig: {
datafeed: {
enabled: false,
},
behindRealtime: null,
delayedData: {
enabled: false,
docsCount: null,
timeInterval: null,
},
errorMessages: null,
mml: {
enabled: false,
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule' },
params: {
testsConfig: {
datafeed: {
enabled: false,
},
behindRealtime: null,
delayedData: {
enabled: false,
docsCount: null,
timeInterval: null,
},
errorMessages: null,
mml: {
enabled: false,
},
},
includeJobs: {
jobIds: ['test_job_01'],
groupIds: [],
},
excludeJobs: null,
},
},
includeJobs: {
jobIds: ['test_job_01'],
groupIds: [],
},
excludeJobs: null,
});
})
);
expect(logger.warn).not.toHaveBeenCalled();
expect(logger.debug).toHaveBeenCalledWith(`Performing health checks for job IDs: test_job_01`);
expect(datafeedsService.getDatafeedByJobId).not.toHaveBeenCalled();
expect(executionResult).toEqual([]);
});

test('takes into account delayed data params', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule_04', {
testsConfig: {
delayedData: {
enabled: true,
docsCount: 10,
timeInterval: '4h',
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule_04' },
params: {
testsConfig: {
delayedData: {
enabled: true,
docsCount: 10,
timeInterval: '4h',
},
behindRealtime: { enabled: false, timeInterval: null },
mml: { enabled: false },
datafeed: { enabled: false },
errorMessages: { enabled: false },
},
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
},
behindRealtime: { enabled: false, timeInterval: null },
mml: { enabled: false },
datafeed: { enabled: false },
errorMessages: { enabled: false },
},
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
});
})
);

expect(annotationService.getDelayedDataAnnotations).toHaveBeenCalledWith({
jobIds: ['test_job_01', 'test_job_02'],
Expand Down Expand Up @@ -234,17 +286,22 @@ describe('JobsHealthService', () => {
});

test('returns results based on provided selection', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule_03', {
testsConfig: null,
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
});
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule_03' },
params: {
testsConfig: null,
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
},
})
);
expect(logger.warn).not.toHaveBeenCalled();
expect(logger.debug).toHaveBeenCalledWith(
`Performing health checks for job IDs: test_job_01, test_job_02`
Expand Down

0 comments on commit 461a8cd

Please sign in to comment.