Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Creating category validation package #161261

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,7 @@ x-pack/plugins/maps @elastic/kibana-gis
x-pack/packages/maps/vector_tile_utils @elastic/kibana-gis
x-pack/packages/ml/agg_utils @elastic/ml-ui
x-pack/packages/ml/anomaly_utils @elastic/ml-ui
x-pack/packages/ml/category_validator @elastic/ml-ui
x-pack/packages/ml/data_frame_analytics_utils @elastic/ml-ui
x-pack/packages/ml/data_grid @elastic/ml-ui
x-pack/packages/ml/date_picker @elastic/ml-ui
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,7 @@
"@kbn/maps-vector-tile-utils": "link:x-pack/packages/maps/vector_tile_utils",
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",
Expand Down
2 changes: 2 additions & 0 deletions tsconfig.base.json
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,8 @@
"@kbn/ml-agg-utils/*": ["x-pack/packages/ml/agg_utils/*"],
"@kbn/ml-anomaly-utils": ["x-pack/packages/ml/anomaly_utils"],
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],
Expand Down
1 change: 1 addition & 0 deletions x-pack/.i18nrc.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"packages/ml/data_grid",
"packages/ml/date_picker",
"packages/ml/trained_models_utils",
"packages/ml/category_validator",
"plugins/ml"
],
"xpack.monitoring": ["plugins/monitoring"],
Expand Down
3 changes: 3 additions & 0 deletions x-pack/packages/ml/category_validator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# @kbn/ml-category-validator

Provides functions for validating whether data is suitable for categorization.
Original file line number Diff line number Diff line change
Expand Up @@ -6,76 +6,136 @@
*/

import { i18n } from '@kbn/i18n';
import { VALIDATION_RESULT } from '../types/categories';

export const NUMBER_OF_CATEGORY_EXAMPLES = 5;
/**
* The number of category examples to use for analysis.
*/
export const CATEGORY_EXAMPLES_SAMPLE_SIZE = 1000;

/**
* The warning limit for category examples, expressed as a fraction (0.75 = 75%).
* If the category examples validation falls below this limit, a warning is triggered.
* The token-count check description renders this value as a percentage.
*/
export const CATEGORY_EXAMPLES_WARNING_LIMIT = 0.75;

/**
* The error limit for category examples, expressed as a fraction (0.02 = 2%).
* If the category examples validation falls below this limit, an error is triggered.
*/
export const CATEGORY_EXAMPLES_ERROR_LIMIT = 0.02;

/**
* Token-count threshold for a single category example.
* The token-count check description reports examples containing more than this many tokens.
*/
export const VALID_TOKEN_COUNT = 3;

/**
* The limit for the median line length of category examples, in characters.
*/
export const MEDIAN_LINE_LENGTH_LIMIT = 400;

/**
* Threshold used by the null-values check, expressed as a fraction (0.75 = 75%)
* despite the `PERCENT` in its name.
* The null-values check description displays `100 - NULL_COUNT_PERCENT_LIMIT * 100` (i.e. 25)
* as the percentage of null examples.
* NOTE(review): confirm against the validator which side of the ratio this limit is compared to.
*/
export const NULL_COUNT_PERCENT_LIMIT = 0.75;

/**
* Enum representing the validation status of category examples.
* Every member is backed by an explicit string value.
*/
export enum CATEGORY_EXAMPLES_VALIDATION_STATUS {
// The examples passed validation.
VALID = 'valid',
// The examples only partly passed validation.
// NOTE(review): presumably corresponds to the warning limit — confirm against the validator.
PARTIALLY_VALID = 'partially_valid',
// The examples failed validation.
INVALID = 'invalid',
}

/**
* Enum identifying each validation check that is run against field examples.
*
* This is a numeric enum with implicit values: members are numbered from 0 in
* declaration order, so inserting or reordering members changes their values.
*/
export enum VALIDATION_RESULT {
// Check: whether examples could be loaded.
NO_EXAMPLES,
// Check: whether the loaded examples could be tokenized.
FAILED_TO_TOKENIZE,
// Check: total number of tokens found across the loaded examples.
TOO_MANY_TOKENS,
// Check: number of tokens found per example.
TOKEN_COUNT,
// Check: median line length of the loaded examples.
MEDIAN_LINE_LENGTH,
// Check: proportion of loaded examples that are null.
NULL_VALUES,
// Check: whether the user has sufficient privileges to perform the checks.
INSUFFICIENT_PRIVILEGES,
}

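/**
* Human-readable description of each validation check, keyed by VALIDATION_RESULT.
* Each message is worded as the passing outcome of its check.
*/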
export const VALIDATION_CHECK_DESCRIPTION = {
/**
* Examples were successfully loaded.
*/
[VALIDATION_RESULT.NO_EXAMPLES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validNoDataFound',
{
defaultMessage: 'Examples were successfully loaded.',
defaultMessage: 'Examples were successfully loaded.',
}
),
) as string,
/**
* The loaded examples were tokenized successfully.
*/
[VALIDATION_RESULT.FAILED_TO_TOKENIZE]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validFailureToGetTokens',
{
defaultMessage: 'The examples loaded were tokenized successfully.',
defaultMessage: 'The loaded examples were tokenized successfully.',
}
),
) as string,
/**
* More than {tokenCount} tokens per example were found in over {percentage}% of the loaded examples.
*/
[VALIDATION_RESULT.TOKEN_COUNT]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validTokenLength',
{
defaultMessage:
'More than {tokenCount} tokens per example were found in over {percentage}% of the examples loaded.',
'More than {tokenCount} tokens per example were found in over {percentage}% of the loaded examples.',
values: {
percentage: Math.floor(CATEGORY_EXAMPLES_WARNING_LIMIT * 100),
tokenCount: VALID_TOKEN_COUNT,
},
}
),
) as string,
/**
* The median line length of the loaded examples was less than {medianCharCount} characters.
*/
[VALIDATION_RESULT.MEDIAN_LINE_LENGTH]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validMedianLineLength',
{
defaultMessage:
'The median line length of the examples loaded was less than {medianCharCount} characters.',
'The median line length of the loaded examples was less than {medianCharCount} characters.',
values: {
medianCharCount: MEDIAN_LINE_LENGTH_LIMIT,
},
}
),
) as string,
/**
* Less than {percentage}% of the loaded examples were null.
*/
[VALIDATION_RESULT.NULL_VALUES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validNullValues',
{
defaultMessage: 'Less than {percentage}% of the examples loaded were null.',
defaultMessage: 'Less than {percentage}% of the loaded examples were null.',
values: {
percentage: Math.floor(100 - NULL_COUNT_PERCENT_LIMIT * 100),
},
}
),
) as string,
/**
* Less than 10000 tokens were found in total in the loaded examples.
*/
[VALIDATION_RESULT.TOO_MANY_TOKENS]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validTooManyTokens',
{
defaultMessage: 'Less than 10000 tokens were found in total in the examples loaded.',
defaultMessage: 'Less than 10000 tokens were found in total in the loaded examples.',
}
),
) as string,
/**
* The user has sufficient privileges to perform the checks.
*/
[VALIDATION_RESULT.INSUFFICIENT_PRIVILEGES]: i18n.translate(
'xpack.ml.models.jobService.categorization.messages.validUserPrivileges',
{
defaultMessage: 'The user has sufficient privileges to perform the checks.',
}
),
) as string,
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';

import {
CATEGORY_EXAMPLES_VALIDATION_STATUS,
VALIDATION_RESULT,
} from '../constants/categorization';

/**
* A single token extracted from a piece of text, together with its location in that text.
* NOTE(review): the field names appear to mirror a token entry in the Elasticsearch
* analyze API response — confirm against the code that produces these objects.
*/
export interface Token {
/**
* The token string.
*/
token: string;
/**
* The starting offset of the token.
*/
start_offset: number;
/**
* The ending offset of the token.
*/
end_offset: number;
/**
* The type of the token.
*/
type: string;
/**
* The position of the token.
*/
position: number;
}

/**
* Categorization analyzer: the Elasticsearch `MlCategorizationAnalyzerDefinition`
* type extended with an optional `analyzer` property.
*/
export type CategorizationAnalyzer = estypes.MlCategorizationAnalyzerDefinition & {
/**
* The analyzer used for categorization. Optional.
*/
analyzer?: string;
};

/**
* Field example for a category: the example text paired with the tokens extracted from it.
*/
export interface CategoryFieldExample {
/**
* The text of the field example.
*/
text: string;
/**
* The tokens extracted from the field example.
*/
tokens: Token[];
}

/**
* Result of a single validation check performed on field examples.
*/
export interface FieldExampleCheck {
/**
* Identifies which validation check this result belongs to.
*/
id: VALIDATION_RESULT;
/**
* The validation status reported for this check.
*/
valid: CATEGORY_EXAMPLES_VALIDATION_STATUS;
/**
* The message associated with the validation result.
*/
message: string;
}
25 changes: 25 additions & 0 deletions x-pack/packages/ml/category_validator/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export { categorizationExamplesProvider } from './src/examples';
export type {
CategorizationAnalyzer,
CategoryFieldExample,
FieldExampleCheck,
Token,
} from './common/types/categories';
export {
CATEGORY_EXAMPLES_ERROR_LIMIT,
CATEGORY_EXAMPLES_SAMPLE_SIZE,
CATEGORY_EXAMPLES_VALIDATION_STATUS,
CATEGORY_EXAMPLES_WARNING_LIMIT,
MEDIAN_LINE_LENGTH_LIMIT,
NULL_COUNT_PERCENT_LIMIT,
VALID_TOKEN_COUNT,
VALIDATION_CHECK_DESCRIPTION,
VALIDATION_RESULT,
} from './common/constants/categorization';
12 changes: 12 additions & 0 deletions x-pack/packages/ml/category_validator/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// Jest configuration for the @kbn/ml-category-validator package.
module.exports = {
// NOTE(review): presumably the shared Kibana preset for Node-environment tests — confirm.
preset: '@kbn/test/jest_node',
// Four directory levels above x-pack/packages/ml/category_validator.
rootDir: '../../../..',
// Restrict Jest's file search to this package's directory.
roots: ['<rootDir>/x-pack/packages/ml/category_validator'],
};
5 changes: 5 additions & 0 deletions x-pack/packages/ml/category_validator/kibana.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "shared-common",
"id": "@kbn/ml-category-validator",
"owner": "@elastic/ml-ui"
}
6 changes: 6 additions & 0 deletions x-pack/packages/ml/category_validator/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "@kbn/ml-category-validator",
"private": true,
"version": "1.0.0",
"license": "Elastic License 2.0"
}
Loading