Skip to content

Commit

Permalink
[ML] Creating category validation package (#161261)
Browse files Browse the repository at this point in the history
Moves the server- and client-side code that analyzes data to determine
whether it is suitable for categorization into a shared package.
This is currently only used by the categorization job wizard to display
this callout:

![image](https://github.com/elastic/kibana/assets/22172091/08db5321-0c38-474d-9bfe-90b8a9ad984a)

However, this analysis will also be useful for the Log Pattern Analysis
feature, so moving the code to a package makes it easier to share
between the ML and AIOps plugins.

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
jgowdyelastic and kibanamachine committed Jul 19, 2023
1 parent 85c85e9 commit 219426d
Show file tree
Hide file tree
Showing 36 changed files with 354 additions and 126 deletions.
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ x-pack/plugins/maps @elastic/kibana-gis
x-pack/packages/maps/vector_tile_utils @elastic/kibana-gis
x-pack/packages/ml/agg_utils @elastic/ml-ui
x-pack/packages/ml/anomaly_utils @elastic/ml-ui
x-pack/packages/ml/category_validator @elastic/ml-ui
x-pack/packages/ml/data_frame_analytics_utils @elastic/ml-ui
x-pack/packages/ml/data_grid @elastic/ml-ui
x-pack/packages/ml/date_picker @elastic/ml-ui
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@
"@kbn/maps-vector-tile-utils": "link:x-pack/packages/maps/vector_tile_utils",
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",
Expand Down
2 changes: 2 additions & 0 deletions tsconfig.base.json
Original file line number Diff line number Diff line change
Expand Up @@ -962,6 +962,8 @@
"@kbn/ml-agg-utils/*": ["x-pack/packages/ml/agg_utils/*"],
"@kbn/ml-anomaly-utils": ["x-pack/packages/ml/anomaly_utils"],
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],
Expand Down
1 change: 1 addition & 0 deletions x-pack/.i18nrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"packages/ml/data_grid",
"packages/ml/date_picker",
"packages/ml/trained_models_utils",
"packages/ml/category_validator",
"plugins/ml"
],
"xpack.monitoring": ["plugins/monitoring"],
Expand Down
3 changes: 3 additions & 0 deletions x-pack/packages/ml/category_validator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# @kbn/ml-category-validator

Provides functions for validating data to see whether it is suitable for categorization.
Original file line number Diff line number Diff line change
Expand Up @@ -6,76 +6,136 @@
*/

import { i18n } from '@kbn/i18n';
import { VALIDATION_RESULT } from '../types/categories';

export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
/**
 * Number of examples sampled for the categorization analysis.
 */
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;

/**
 * Warning limit for the category examples validation. If the validation
 * result falls below this limit, a warning is triggered.
 *
 * Expressed as a fraction: VALIDATION_CHECK_DESCRIPTION multiplies it by 100
 * to render the percentage shown in the TOKEN_COUNT description.
 */
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;

/**
 * Error limit for the category examples validation. If the validation
 * result falls below this limit, an error is triggered.
 *
 * NOTE(review): presumably a fraction on the same scale as
 * CATEGORY_EXAMPLES_WARNING_LIMIT — confirm against the validator code.
 */
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;

/**
 * Token count threshold per example. Rendered as `{tokenCount}` in the
 * TOKEN_COUNT description ("More than {tokenCount} tokens per example ...").
 */
export const VALID_TOKEN_COUNT = 3;

/**
 * Limit for the median line length of the examples, in characters. Rendered
 * as `{medianCharCount}` in the MEDIAN_LINE_LENGTH description.
 */
export const MEDIAN_LINE_LENGTH_LIMIT = 400;

/**
 * Limit used by the null values check, expressed as a fraction rather than a
 * percentage: the NULL_VALUES description displays `100 - limit * 100` as the
 * percentage of examples that may be null.
 */
export const NULL_COUNT_PERCENT_LIMIT = 0.75;

/**
 * Overall outcome of validating a set of category examples.
 * Each member maps to the string value used to represent that outcome.
 */
export enum CATEGORY_EXAMPLES_VALIDATION_STATUS {
  /** The examples passed validation. */
  VALID = 'valid',
  /** The examples only partially passed validation. */
  PARTIALLY_VALID = 'partially_valid',
  /** The examples failed validation. */
  INVALID = 'invalid',
}

/**
 * Identifiers for the individual checks run against field examples.
 *
 * This is a numeric enum; the explicit initializers below spell out the
 * values the members receive from their declaration order. The human-readable
 * text for each check lives in VALIDATION_CHECK_DESCRIPTION.
 */
export enum VALIDATION_RESULT {
  /** Check that examples could be loaded. */
  NO_EXAMPLES = 0,
  /** Check that the loaded examples could be tokenized. */
  FAILED_TO_TOKENIZE = 1,
  /** Check on the total number of tokens found in the loaded examples. */
  TOO_MANY_TOKENS = 2,
  /** Check on the number of tokens found per example. */
  TOKEN_COUNT = 3,
  /** Check on the median line length of the loaded examples. */
  MEDIAN_LINE_LENGTH = 4,
  /** Check on the proportion of loaded examples that are null. */
  NULL_VALUES = 5,
  /** Check that the user has sufficient privileges to perform the checks. */
  INSUFFICIENT_PRIVILEGES = 6,
}

/**
 * Translated description for each validation check, keyed by VALIDATION_RESULT.
 *
 * Each message describes the passing condition of its check (for example the
 * NO_EXAMPLES entry reads "Examples were successfully loaded."). Placeholder
 * values are filled in from the limit constants defined in this file:
 * CATEGORY_EXAMPLES_WARNING_LIMIT, VALID_TOKEN_COUNT, MEDIAN_LINE_LENGTH_LIMIT
 * and NULL_COUNT_PERCENT_LIMIT. The 10000 figure in the TOO_MANY_TOKENS
 * message is hard-coded in the message text rather than taken from a constant.
 */
export const VALIDATION_CHECK_DESCRIPTION = {
/**
* Examples were successfully loaded.
*/
[VALIDATION_RESULT.NO_EXAMPLES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validNoDataFound',
{
defaultMessage: 'Examples were successfully loaded.',
defaultMessage: 'Examples were successfully loaded.',
}
),
) as string,
/**
* The loaded examples were tokenized successfully.
*/
[VALIDATION_RESULT.FAILED_TO_TOKENIZE]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validFailureToGetTokens',
{
defaultMessage: 'The examples loaded were tokenized successfully.',
defaultMessage: 'The loaded examples were tokenized successfully.',
}
),
) as string,
/**
* More than {tokenCount} tokens per example were found in over {percentage}% of the loaded examples.
*/
[VALIDATION_RESULT.TOKEN_COUNT]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validTokenLength',
{
defaultMessage:
'More than {tokenCount} tokens per example were found in over {percentage}% of the examples loaded.',
'More than {tokenCount} tokens per example were found in over {percentage}% of the loaded examples.',
values: {
percentage: Math.floor(CATEGORY_EXAMPLES_WARNING_LIMIT * 100),
tokenCount: VALID_TOKEN_COUNT,
},
}
),
) as string,
/**
* The median line length of the loaded examples was less than {medianCharCount} characters.
*/
[VALIDATION_RESULT.MEDIAN_LINE_LENGTH]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validMedianLineLength',
{
defaultMessage:
'The median line length of the examples loaded was less than {medianCharCount} characters.',
'The median line length of the loaded examples was less than {medianCharCount} characters.',
values: {
medianCharCount: MEDIAN_LINE_LENGTH_LIMIT,
},
}
),
) as string,
/**
* Less than {percentage}% of the loaded examples were null.
*/
[VALIDATION_RESULT.NULL_VALUES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validNullValues',
{
defaultMessage: 'Less than {percentage}% of the examples loaded were null.',
defaultMessage: 'Less than {percentage}% of the loaded examples were null.',
values: {
percentage: Math.floor(100 - NULL_COUNT_PERCENT_LIMIT * 100),
},
}
),
) as string,
/**
* Less than 10000 tokens were found in total in the loaded examples.
*/
[VALIDATION_RESULT.TOO_MANY_TOKENS]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validTooManyTokens',
{
defaultMessage: 'Less than 10000 tokens were found in total in the examples loaded.',
defaultMessage: 'Less than 10000 tokens were found in total in the loaded examples.',
}
),
) as string,
/**
* The user has sufficient privileges to perform the checks.
*/
[VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validUserPrivileges',
{
defaultMessage: 'The user has sufficient privileges to perform the checks.',
}
),
) as string,
};
81 changes: 81 additions & 0 deletions x-pack/packages/ml/category_validator/common/types/categories.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';

import {
CATEGORY_EXAMPLES_VALIDATION_STATUS,
VALIDATION_RESULT,
} from '../constants/categorization';

/**
 * A single token produced when an example's text is tokenized.
 *
 * NOTE(review): the field names look like the token objects returned by the
 * Elasticsearch analyze API — confirm against the code that fills these in.
 */
export interface Token {
  /** Text of the token. */
  token: string;

  /** Offset at which the token starts. */
  start_offset: number;

  /** Offset at which the token ends. */
  end_offset: number;

  /** Type assigned to the token. */
  type: string;

  /** Position of the token. */
  position: number;
}

/**
 * The Elasticsearch `MlCategorizationAnalyzerDefinition` type, widened with an
 * optional `analyzer` name.
 */
export type CategorizationAnalyzer = estypes.MlCategorizationAnalyzerDefinition & {
  /** Optional analyzer to use for categorization. */
  analyzer?: string;
};

/**
 * One example value of the categorization field, paired with its tokens.
 */
export interface CategoryFieldExample {
  /** Text of the example. */
  text: string;

  /** Tokens extracted from `text`. */
  tokens: Token[];
}

/**
 * Outcome of a single validation check run against the field examples.
 */
export interface FieldExampleCheck {
  /** Which check this outcome belongs to. */
  id: VALIDATION_RESULT;

  /** Validation status reported by the check. */
  valid: CATEGORY_EXAMPLES_VALIDATION_STATUS;

  /** Message associated with the outcome. */
  message: string;
}
25 changes: 25 additions & 0 deletions x-pack/packages/ml/category_validator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export { categorizationExamplesProvider } from './src/examples';
export type {
CategorizationAnalyzer,
CategoryFieldExample,
FieldExampleCheck,
Token,
} from './common/types/categories';
export {
CATEGORY_EXAMPLES_ERROR_LIMIT,
CATEGORY_EXAMPLES_SAMPLE_SIZE,
CATEGORY_EXAMPLES_VALIDATION_STATUS,
CATEGORY_EXAMPLES_WARNING_LIMIT,
MEDIAN_LINE_LENGTH_LIMIT,
NULL_COUNT_PERCENT_LIMIT,
VALID_TOKEN_COUNT,
VALIDATION_CHECK_DESCRIPTION,
VALIDATION_RESULT,
} from './common/constants/categorization';
12 changes: 12 additions & 0 deletions x-pack/packages/ml/category_validator/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// Jest configuration for the @kbn/ml-category-validator package.
module.exports = {
// Shared Jest preset provided by @kbn/test.
preset: '@kbn/test/jest_node',
// Four levels up from x-pack/packages/ml/category_validator, i.e. the repository root.
rootDir: '../../../..',
// Limit Jest's search for test files to this package's directory.
roots: ['<rootDir>/x-pack/packages/ml/category_validator'],
};
5 changes: 5 additions & 0 deletions x-pack/packages/ml/category_validator/kibana.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "shared-common",
"id": "@kbn/ml-category-validator",
"owner": "@elastic/ml-ui"
}
6 changes: 6 additions & 0 deletions x-pack/packages/ml/category_validator/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "@kbn/ml-category-validator",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}

0 comments on commit 219426d

Please sign in to comment.