Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Check for error messages in the Anomaly Detection jobs health rule type #108701

Merged
14 commits merged on Aug 17, 2021
Merged
4 changes: 2 additions & 2 deletions x-pack/plugins/ml/common/constants/alerts.ts
Expand Up @@ -54,12 +54,12 @@ export const HEALTH_CHECK_NAMES: Record<JobsHealthTests, { name: string; descrip
},
errorMessages: {
name: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesCheckName', {
defaultMessage: 'There are errors in the job messages',
defaultMessage: 'Errors in job messages',
}),
description: i18n.translate(
'xpack.ml.alertTypes.jobsHealthAlertingRule.errorMessagesCheckDescription',
{
defaultMessage: 'There are errors in the job messages',
defaultMessage: 'Get alerted if a job contains errors in the job messages.',
}
),
},
Expand Down
6 changes: 6 additions & 0 deletions x-pack/plugins/ml/common/util/alerts.test.ts
Expand Up @@ -95,6 +95,9 @@ describe('getResultJobsHealthRuleConfig', () => {
enabled: true,
timeInterval: null,
},
errorMessages: {
enabled: true,
},
});
});
test('returns config with overridden values based on provided configuration', () => {
Expand All @@ -119,6 +122,9 @@ describe('getResultJobsHealthRuleConfig', () => {
enabled: true,
timeInterval: null,
},
errorMessages: {
enabled: true,
},
});
});
});
2 changes: 1 addition & 1 deletion x-pack/plugins/ml/common/util/alerts.ts
Expand Up @@ -54,7 +54,7 @@ export function getTopNBuckets(job: Job): number {
return Math.ceil(narrowBucketLength / bucketSpan.asSeconds());
}

const implementedTests = ['datafeed', 'mml', 'delayedData'] as JobsHealthTests[];
const implementedTests = ['datafeed', 'mml', 'delayedData', 'errorMessages'] as JobsHealthTests[];

/**
* Returns tests configuration combined with default values.
Expand Down
Expand Up @@ -21,7 +21,8 @@ export function registerJobsHealthAlertingRule(
triggersActionsUi.ruleTypeRegistry.register({
id: ML_ALERT_TYPES.AD_JOBS_HEALTH,
description: i18n.translate('xpack.ml.alertTypes.jobsHealthAlertingRule.description', {
defaultMessage: 'Alert when anomaly detection jobs experience operational issues.',
defaultMessage:
'Alert when anomaly detection jobs experience operational issues. Enable suitable alerts for critically important jobs.',
}),
iconClass: 'bell',
documentationUrl(docLinks) {
Expand Down Expand Up @@ -90,14 +91,15 @@ export function registerJobsHealthAlertingRule(
\\{\\{context.message\\}\\}
\\{\\{#context.results\\}\\}
Job ID: \\{\\{job_id\\}\\}
\\{\\{#datafeed_id\\}\\}Datafeed ID: \\{\\{datafeed_id\\}\\} \\{\\{/datafeed_id\\}\\}
\\{\\{#datafeed_state\\}\\}Datafeed state: \\{\\{datafeed_state\\}\\} \\{\\{/datafeed_state\\}\\}
\\{\\{#memory_status\\}\\}Memory status: \\{\\{memory_status\\}\\} \\{\\{/memory_status\\}\\}
\\{\\{#log_time\\}\\}Memory logging time: \\{\\{log_time\\}\\} \\{\\{/log_time\\}\\}
\\{\\{#failed_category_count\\}\\}Failed category count: \\{\\{failed_category_count\\}\\} \\{\\{/failed_category_count\\}\\}
\\{\\{#annotation\\}\\}Annotation: \\{\\{annotation\\}\\} \\{\\{/annotation\\}\\}
\\{\\{#missed_docs_count\\}\\}Number of missed documents: \\{\\{missed_docs_count\\}\\} \\{\\{/missed_docs_count\\}\\}
\\{\\{#end_timestamp\\}\\}Latest finalized bucket with missing docs: \\{\\{end_timestamp\\}\\} \\{\\{/end_timestamp\\}\\}
\\{\\{#datafeed_id\\}\\}Datafeed ID: \\{\\{datafeed_id\\}\\}
\\{\\{/datafeed_id\\}\\} \\{\\{#datafeed_state\\}\\}Datafeed state: \\{\\{datafeed_state\\}\\}
\\{\\{/datafeed_state\\}\\} \\{\\{#memory_status\\}\\}Memory status: \\{\\{memory_status\\}\\}
\\{\\{/memory_status\\}\\} \\{\\{#log_time\\}\\}Memory logging time: \\{\\{log_time\\}\\}
\\{\\{/log_time\\}\\} \\{\\{#failed_category_count\\}\\}Failed category count: \\{\\{failed_category_count\\}\\}
\\{\\{/failed_category_count\\}\\} \\{\\{#annotation\\}\\}Annotation: \\{\\{annotation\\}\\}
\\{\\{/annotation\\}\\} \\{\\{#missed_docs_count\\}\\}Number of missed documents: \\{\\{missed_docs_count\\}\\}
\\{\\{/missed_docs_count\\}\\} \\{\\{#end_timestamp\\}\\}Latest finalized bucket with missing docs: \\{\\{end_timestamp\\}\\}
\\{\\{/end_timestamp\\}\\} \\{\\{#errors\\}\\}Error message: \\{\\{message\\}\\} \\{\\{/errors\\}\\}
\\{\\{/context.results\\}\\}
`,
}
Expand Down
177 changes: 117 additions & 60 deletions x-pack/plugins/ml/server/lib/alerts/jobs_health_service.test.ts
Expand Up @@ -11,9 +11,39 @@ import type { Logger } from 'kibana/server';
import { MlClient } from '../ml_client';
import { MlJob, MlJobStats } from '@elastic/elasticsearch/api/types';
import { AnnotationService } from '../../models/annotation_service/annotation';
import { JobsHealthExecutorOptions } from './register_jobs_monitoring_rule_type';
import { JobAuditMessagesService } from '../../models/job_audit_messages/job_audit_messages';
import { DeepPartial } from '../../../common/types/common';

const MOCK_DATE_NOW = 1487076708000;

/**
 * Builds a `JobsHealthExecutorOptions` fixture for the tests in this file.
 *
 * The merge with `overrides` is shallow: any top-level key present in
 * `overrides` replaces the corresponding default wholesale. For example,
 * passing `rule: { name: 'x' }` produces a `rule` object that contains only
 * `name`; the other default rule fields are not retained.
 *
 * @param overrides - top-level fields to lay over the defaults
 * @returns the defaults combined with the overrides, cast to the executor options type
 */
function getDefaultExecutorOptions(
  overrides: DeepPartial<JobsHealthExecutorOptions> = {}
): JobsHealthExecutorOptions {
  const ruleName = 'ml-health-check';
  const author = 'elastic';

  // Fresh objects (dates, arrays, nested rule) are created on every call so
  // that fixtures are never shared between tests.
  const defaultRule = {
    name: ruleName,
    tags: [],
    consumer: 'alerts',
    producer: 'ml',
    ruleTypeId: 'xpack.ml.anomaly_detection_jobs_health',
    ruleTypeName: 'Anomaly detection jobs health',
    enabled: true,
    schedule: { interval: '10s' },
  };

  const defaults = {
    state: {},
    startedAt: new Date('2021-08-12T13:13:39.396Z'),
    previousStartedAt: new Date('2021-08-12T13:13:27.396Z'),
    spaceId: 'default',
    namespace: undefined,
    name: ruleName,
    tags: [],
    createdBy: author,
    updatedBy: author,
    rule: defaultRule,
  };

  // Later sources win, so overrides take precedence over the defaults.
  const combined: unknown = Object.assign({}, defaults, overrides);
  return combined as JobsHealthExecutorOptions;
}

describe('JobsHealthService', () => {
const mlClient = ({
getJobs: jest.fn().mockImplementation(({ job_id: jobIds = [] }) => {
Expand Down Expand Up @@ -117,6 +147,12 @@ describe('JobsHealthService', () => {
}),
} as unknown) as jest.Mocked<AnnotationService>;

// Test double for JobAuditMessagesService: only `getJobsErrors` is provided,
// and it always resolves to an empty object regardless of the job IDs passed.
// The double cast through `unknown` is needed because the literal implements
// just this one method rather than the full service interface.
const jobAuditMessagesService = ({
getJobsErrors: jest.fn().mockImplementation((jobIds: string) => {
return Promise.resolve({});
}),
} as unknown) as jest.Mocked<JobAuditMessagesService>;

const logger = ({
warn: jest.fn(),
info: jest.fn(),
Expand All @@ -127,6 +163,7 @@ describe('JobsHealthService', () => {
mlClient,
datafeedsService,
annotationService,
jobAuditMessagesService,
logger
);

Expand All @@ -143,70 +180,85 @@ describe('JobsHealthService', () => {

test('returns empty results when no jobs provided', async () => {
// act
const executionResult = await jobHealthService.getTestsResults('testRule', {
testsConfig: null,
includeJobs: {
jobIds: ['*'],
groupIds: [],
},
excludeJobs: null,
});
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule' },
params: {
testsConfig: null,
includeJobs: {
jobIds: ['*'],
groupIds: [],
},
excludeJobs: null,
},
})
);
expect(logger.warn).toHaveBeenCalledWith('Rule "testRule" does not have associated jobs.');
expect(datafeedsService.getDatafeedByJobId).not.toHaveBeenCalled();
expect(executionResult).toEqual([]);
});

test('returns empty results and does not perform datafeed check when test is disabled', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule', {
testsConfig: {
datafeed: {
enabled: false,
},
behindRealtime: null,
delayedData: {
enabled: false,
docsCount: null,
timeInterval: null,
},
errorMessages: null,
mml: {
enabled: false,
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule' },
params: {
testsConfig: {
datafeed: {
enabled: false,
},
behindRealtime: null,
delayedData: {
enabled: false,
docsCount: null,
timeInterval: null,
},
errorMessages: null,
mml: {
enabled: false,
},
},
includeJobs: {
jobIds: ['test_job_01'],
groupIds: [],
},
excludeJobs: null,
},
},
includeJobs: {
jobIds: ['test_job_01'],
groupIds: [],
},
excludeJobs: null,
});
})
);
expect(logger.warn).not.toHaveBeenCalled();
expect(logger.debug).toHaveBeenCalledWith(`Performing health checks for job IDs: test_job_01`);
expect(datafeedsService.getDatafeedByJobId).not.toHaveBeenCalled();
expect(executionResult).toEqual([]);
});

test('takes into account delayed data params', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule_04', {
testsConfig: {
delayedData: {
enabled: true,
docsCount: 10,
timeInterval: '4h',
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule_04' },
params: {
testsConfig: {
delayedData: {
enabled: true,
docsCount: 10,
timeInterval: '4h',
},
behindRealtime: { enabled: false, timeInterval: null },
mml: { enabled: false },
datafeed: { enabled: false },
errorMessages: { enabled: false },
},
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
},
behindRealtime: { enabled: false, timeInterval: null },
mml: { enabled: false },
datafeed: { enabled: false },
errorMessages: { enabled: false },
},
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
});
})
);

expect(annotationService.getDelayedDataAnnotations).toHaveBeenCalledWith({
jobIds: ['test_job_01', 'test_job_02'],
Expand Down Expand Up @@ -234,17 +286,22 @@ describe('JobsHealthService', () => {
});

test('returns results based on provided selection', async () => {
const executionResult = await jobHealthService.getTestsResults('testRule_03', {
testsConfig: null,
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
});
const executionResult = await jobHealthService.getTestsResults(
getDefaultExecutorOptions({
rule: { name: 'testRule_03' },
params: {
testsConfig: null,
includeJobs: {
jobIds: [],
groupIds: ['test_group'],
},
excludeJobs: {
jobIds: ['test_job_03'],
groupIds: [],
},
},
})
);
expect(logger.warn).not.toHaveBeenCalled();
expect(logger.debug).toHaveBeenCalledWith(
`Performing health checks for job IDs: test_job_01, test_job_02`
Expand Down