Skip to content

Commit

Permalink
poc
Browse files Browse the repository at this point in the history
  • Loading branch information
walterra committed Aug 29, 2023
1 parent d6db6b7 commit 1fd0edc
Show file tree
Hide file tree
Showing 25 changed files with 1,709 additions and 1,254 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,7 @@
"@kbn/ml-agg-utils": "link:x-pack/packages/ml/agg_utils",
"@kbn/ml-anomaly-utils": "link:x-pack/packages/ml/anomaly_utils",
"@kbn/ml-category-validator": "link:x-pack/packages/ml/category_validator",
"@kbn/ml-chi2test": "link:x-pack/packages/ml/chi2test",
"@kbn/ml-data-frame-analytics-utils": "link:x-pack/packages/ml/data_frame_analytics_utils",
"@kbn/ml-data-grid": "link:x-pack/packages/ml/data_grid",
"@kbn/ml-date-picker": "link:x-pack/packages/ml/date_picker",
Expand Down
2 changes: 2 additions & 0 deletions tsconfig.base.json
Original file line number Diff line number Diff line change
Expand Up @@ -976,6 +976,8 @@
"@kbn/ml-anomaly-utils/*": ["x-pack/packages/ml/anomaly_utils/*"],
"@kbn/ml-category-validator": ["x-pack/packages/ml/category_validator"],
"@kbn/ml-category-validator/*": ["x-pack/packages/ml/category_validator/*"],
"@kbn/ml-chi2test": ["x-pack/packages/ml/chi2test"],
"@kbn/ml-chi2test/*": ["x-pack/packages/ml/chi2test/*"],
"@kbn/ml-data-frame-analytics-utils": ["x-pack/packages/ml/data_frame_analytics_utils"],
"@kbn/ml-data-frame-analytics-utils/*": ["x-pack/packages/ml/data_frame_analytics_utils/*"],
"@kbn/ml-data-grid": ["x-pack/packages/ml/data_grid"],
Expand Down
3 changes: 3 additions & 0 deletions x-pack/packages/ml/chi2test/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# @kbn/ml-chi2test

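Utilities for a chi-squared test that compares two term histograms. Exports `computeChi2PValue`, `criticalTableLookup`, the `CRITICAL_VALUES_TABLE` and `SIGNIFICANCE_LEVELS` constants, and the `Histogram` type. Package scaffold generated by @kbn/generate.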
45 changes: 45 additions & 0 deletions x-pack/packages/ml/chi2test/compute_chi_2_pvalue.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { criticalTableLookup } from './critical_table_lookup';
import type { Histogram } from './types';

/**
 * Compute the p-value for how similar the datasets are.
 * Returned value ranges from 0 to 1, with 1 meaning the datasets are identical.
 *
 * Terms from both histograms are matched by the string form of their `key`,
 * so a numeric key `1` and a string key `'1'` refer to the same term. Only the
 * first 100 unique keys (baseline keys first, then drifted keys) are considered.
 *
 * @param normalizedBaselineTerms Baseline histogram; `percentage` is used as the expected value.
 * @param normalizedDriftedTerms Drifted histogram; `percentage` is used as the observed value.
 * @returns `1` when fewer than two unique keys exist, otherwise the result of
 * `criticalTableLookup` for the computed chi-squared statistic and degrees of freedom.
 */
export const computeChi2PValue = (
  normalizedBaselineTerms: Histogram[],
  normalizedDriftedTerms: Histogram[]
) => {
  const expectedByKey = buildPercentageLookup(normalizedBaselineTerms);
  const observedByKey = buildPercentageLookup(normalizedDriftedTerms);

  // Get all unique keys from both arrays
  const allKeys: string[] = Array.from(
    new Set([...expectedByKey.keys(), ...observedByKey.keys()])
  ).slice(0, 100);

  // Calculate the chi-squared statistic and degrees of freedom
  const degreesOfFreedom = allKeys.length - 1;

  // With zero or one category there is nothing to compare.
  if (degreesOfFreedom <= 0) return 1;

  let chiSquared = 0;
  for (const key of allKeys) {
    const observed = observedByKey.get(key) ?? 0;
    const expected = expectedByKey.get(key) ?? 0;
    chiSquared += Math.pow(observed - expected, 2) / (expected > 0 ? expected : 1e-6); // Prevent divide by zero
  }

  return criticalTableLookup(chiSquared, degreesOfFreedom);
};

/**
 * Index the `percentage` of each term by the string form of its key.
 * The first term wins when a key occurs more than once; a missing
 * `percentage` is treated as 0.
 */
function buildPercentageLookup(terms: Histogram[]): Map<string, number> {
  const lookup = new Map<string, number>();
  for (const term of terms) {
    const key = term.key.toString();
    if (!lookup.has(key)) {
      lookup.set(key, term.percentage ?? 0);
    }
  }
  return lookup;
}
1,015 changes: 1,015 additions & 0 deletions x-pack/packages/ml/chi2test/constants.ts

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions x-pack/packages/ml/chi2test/critical_table_lookup.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';

/**
 * Look up the significance level whose critical value is closest to the
 * given chi-squared statistic.
 *
 * The row of `CRITICAL_VALUES_TABLE` is selected by `df - 1`; the column with
 * the smallest absolute distance to `chi2Statistic` is mapped to the entry at
 * the same index in `SIGNIFICANCE_LEVELS`. On ties the leftmost column wins.
 *
 * @param chi2Statistic The chi-squared statistic to look up.
 * @param df Degrees of freedom. Values below 1 return `1`.
 * @returns The matching entry of `SIGNIFICANCE_LEVELS`, or `1` when `df < 1`.
 * @throws Error when `df` is 1 or greater but not an integer.
 */
export const criticalTableLookup = (chi2Statistic: number, df: number) => {
  if (df < 1) return 1;
  if (!Number.isInteger(df)) throw Error('Degrees of freedom must be a valid integer');

  // Rows are stored zero-based, degrees of freedom start at 1.
  const criticalValues = CRITICAL_VALUES_TABLE[df - 1];

  let closestColumn = 0;
  let smallestDistance = Math.abs(criticalValues[0] - chi2Statistic);

  criticalValues.forEach((criticalValue: number, column: number) => {
    const distance = Math.abs(criticalValue - chi2Statistic);
    // Strict comparison keeps the leftmost column when distances are equal.
    if (distance < smallestDistance) {
      smallestDistance = distance;
      closestColumn = column;
    }
  });

  return SIGNIFICANCE_LEVELS[closestColumn];
};
11 changes: 11 additions & 0 deletions x-pack/packages/ml/chi2test/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

export { computeChi2PValue } from './compute_chi_2_pvalue';
export { criticalTableLookup } from './critical_table_lookup';
export { CRITICAL_VALUES_TABLE, SIGNIFICANCE_LEVELS } from './constants';
export type { Histogram } from './types';
12 changes: 12 additions & 0 deletions x-pack/packages/ml/chi2test/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

// Jest configuration for the chi2test package.
module.exports = {
// Shared preset referenced by name.
preset: '@kbn/test',
// Resolved relative to this file: four directories up.
rootDir: '../../../..',
// Restrict the run to this package's directory.
roots: ['<rootDir>/x-pack/packages/ml/chi2test'],
};
5 changes: 5 additions & 0 deletions x-pack/packages/ml/chi2test/kibana.jsonc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"type": "shared-common",
"id": "@kbn/ml-chi2test",
"owner": "@elastic/ml-ui"
}
6 changes: 6 additions & 0 deletions x-pack/packages/ml/chi2test/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"name": "@kbn/ml-chi2test",
"private": true,
"version": "1.0.0",
"license": "SSPL-1.0 OR Elastic License 2.0"
}
19 changes: 19 additions & 0 deletions x-pack/packages/ml/chi2test/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{
"extends": "../../../../tsconfig.base.json",
"compilerOptions": {
"outDir": "target/types",
"types": [
"jest",
"node",
"react"
]
},
"include": [
"**/*.ts",
"**/*.tsx",
],
"exclude": [
"target/**/*"
],
"kbn_references": []
}
12 changes: 12 additions & 0 deletions x-pack/packages/ml/chi2test/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

/**
 * A single term/bucket of a histogram as consumed by `computeChi2PValue`.
 */
export interface Histogram {
// Number of documents in the bucket. Not read by `computeChi2PValue`.
doc_count: number;
// Bucket key; `computeChi2PValue` collects keys via `key.toString()`.
key: string | number;
// Value used as the observed/expected quantity by `computeChi2PValue`;
// treated as 0 when absent. Presumably the bucket's share of the total — confirm with callers.
percentage?: number;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { cloneDeep } from 'lodash';

import type { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';

/**
 * Build the query used for a categorization request by adding a time range
 * filter to a copy of the supplied query. The input query is not mutated.
 *
 * - A missing `bool` / `bool.must` is created.
 * - An existing `bool.must` given as a single clause object is wrapped in an
 *   array so the range clause can be appended.
 * - When `bool.must` had to be created, a top-level `match_all` is moved into it.
 * - A top-level `multi_match` is moved to `bool.should`.
 *
 * @param queryIn Query to extend.
 * @param timeField Name of the time field the range filter is applied to.
 * @param from Lower bound (`gte`) of the range, in epoch milliseconds.
 * @param to Upper bound (`lte`) of the range, in epoch milliseconds.
 * @returns The cloned query with the range clause appended to `bool.must`.
 */
export function createCategorizeQuery(
  queryIn: QueryDslQueryContainer,
  timeField: string,
  from: number | undefined,
  to: number | undefined
) {
  const query = cloneDeep(queryIn);

  if (query.bool === undefined) {
    query.bool = {};
  }

  let must: QueryDslQueryContainer[];
  if (query.bool.must === undefined) {
    must = [];
    if (query.match_all !== undefined) {
      must.push({ match_all: query.match_all });
      delete query.match_all;
    }
  } else if (Array.isArray(query.bool.must)) {
    must = query.bool.must;
  } else {
    // `must` may be a single clause object; wrap it so we can append to it.
    must = [query.bool.must];
  }
  query.bool.must = must;

  if (query.multi_match !== undefined) {
    query.bool.should = {
      multi_match: query.multi_match,
    };
    delete query.multi_match;
  }

  must.push({
    range: {
      [timeField]: {
        gte: from,
        lte: to,
        format: 'epoch_millis',
      },
    },
  });

  return query;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import type { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';

import { createRandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';

import { createCategorizeQuery } from './create_categorize_query';

// Maximum number of categories requested from the `categorize_text` aggregation.
const CATEGORY_LIMIT = 1000;
// Number of example hits requested per category.
const EXAMPLE_LIMIT = 1;

/**
 * Assemble the search request parameters for a `categorize_text` aggregation.
 *
 * @param index Index (pattern) to search.
 * @param field Field to categorize; also used as `_source` filter for the example hits.
 * @param timeField Time field used for the range filter, example sorting and the sparkline histogram.
 * @param from Lower time bound passed to `createCategorizeQuery`.
 * @param to Upper time bound passed to `createCategorizeQuery`.
 * @param queryIn Base query passed to `createCategorizeQuery`.
 * @param wrap Function applied to the aggregations before they are placed in the request body.
 * @param intervalMs When truthy, a `sparkline` date histogram with this fixed interval is added per category.
 * @returns An object with a `params` property holding `index`, `size: 0` and the request `body`.
 */
export function createCategoryRequest(
  index: string,
  field: string,
  timeField: string,
  from: number | undefined,
  to: number | undefined,
  queryIn: QueryDslQueryContainer,
  wrap: ReturnType<typeof createRandomSamplerWrapper>['wrap'],
  intervalMs?: number
) {
  const query = createCategorizeQuery(queryIn, timeField, from, to);

  // Example document(s) per category.
  const hit = {
    top_hits: {
      size: EXAMPLE_LIMIT,
      sort: [timeField],
      _source: field,
    },
  };

  // Optional per-category document count over time.
  const sparklineAgg = intervalMs
    ? {
        sparkline: {
          date_histogram: {
            field: timeField,
            fixed_interval: `${intervalMs}ms`,
          },
        },
      }
    : {};

  const aggs = {
    categories: {
      categorize_text: {
        field,
        size: CATEGORY_LIMIT,
      },
      aggs: {
        hit,
        ...sparklineAgg,
      },
    },
  };

  const body = { query, aggs: wrap(aggs) };

  return {
    params: {
      index,
      size: 0,
      body,
    },
  };
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

import { get } from 'lodash';

import { estypes } from '@elastic/elasticsearch';

import { createRandomSamplerWrapper } from '@kbn/ml-random-sampler-utils';

/**
 * A single category as returned by `processCategoryResults`.
 */
export interface Category {
// Category key, copied from the aggregation bucket's `key`.
key: string;
// Number of documents in the category, copied from the bucket's `doc_count`.
count: number;
// Example values of the categorized field, read from the bucket's top hits.
examples: string[];
// Optional sparkline buckets. Not populated by `processCategoryResults`, which
// returns sparkline data separately in `sparkLinesPerCategory`.
sparkline?: Array<{ doc_count: number; key: number; key_as_string: string }>;
}

/**
 * Shape of the `categories` aggregation result read by `processCategoryResults`.
 */
export interface CategoriesAgg {
categories: {
buckets: Array<{
// Category key.
key: string;
// Number of documents in the category.
doc_count: number;
// Example hits for the category.
// NOTE(review): `_source` is typed with a fixed `message` property, but
// `processCategoryResults` reads it via `get(h._source, field)` with an arbitrary field.
hit: { hits: { hits: Array<{ _source: { message: string } }> } };
// Per-category histogram buckets.
// NOTE(review): declared as required, yet `processCategoryResults` checks it for
// `undefined` — consider whether this should be optional.
sparkline: {
buckets: Array<{ key_as_string: string; key: number; doc_count: number }>;
};
}>;
};
}

/**
 * Variant of the aggregation result where `CategoriesAgg` is nested under a `sample` key.
 * Presumably the shape produced when the random sampler wrapper is applied — confirm
 * against `createRandomSamplerWrapper`.
 */
interface CategoriesSampleAgg {
sample: CategoriesAgg;
}

/**
 * Response object consumed by `processCategoryResults`.
 */
export interface CatResponse {
// Raw search response body whose aggregations are either the plain or the
// `sample`-nested categories aggregation.
rawResponse: estypes.SearchResponseBody<unknown, CategoriesAgg | CategoriesSampleAgg>;
}

/**
 * Sparkline data keyed by category key; each entry maps a sparkline bucket's
 * numeric `key` to that bucket's `doc_count`.
 */
export type SparkLinesPerCategory = Record<string, Record<number, number>>;

/**
 * Convert a categorization search response into a list of categories and a
 * lookup of sparkline data per category.
 *
 * @param result Response whose `rawResponse.aggregations` holds the categories aggregation.
 * @param field Field whose value is read from each example hit's `_source`.
 * @param unwrap Function applied to the aggregations before the `categories` buckets are read.
 * @returns `categories` (key, count and examples per bucket) and
 * `sparkLinesPerCategory` (bucket key to doc count, per category key; an empty
 * object for buckets without a `sparkline`).
 * @throws Error when the response contains no aggregations.
 */
export function processCategoryResults(
  result: CatResponse,
  field: string,
  unwrap: ReturnType<typeof createRandomSamplerWrapper>['unwrap']
) {
  const { aggregations } = result.rawResponse;
  if (aggregations === undefined) {
    throw new Error('processCategoryResults failed, did not return aggregations.');
  }

  const unwrapped = unwrap(
    aggregations as unknown as Record<string, estypes.AggregationsAggregate>
  ) as CategoriesAgg;
  const buckets = unwrapped.categories.buckets;

  const sparkLinesPerCategory: SparkLinesPerCategory = {};
  const categories: Category[] = [];

  for (const bucket of buckets) {
    // Collect the sparkline as a map of bucket key to doc count.
    const sparkLine: Record<number, number> = {};
    if (bucket.sparkline !== undefined) {
      for (const point of bucket.sparkline.buckets) {
        sparkLine[point.key] = point.doc_count;
      }
    }
    sparkLinesPerCategory[bucket.key] = sparkLine;

    categories.push({
      key: bucket.key,
      count: bucket.doc_count,
      examples: bucket.hit.hits.hits.map((hit) => get(hit._source, field)),
    });
  }

  return {
    categories,
    sparkLinesPerCategory,
  };
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,14 @@ import { Filter, Query } from '@kbn/es-query';
import { i18n } from '@kbn/i18n';
import { FormattedMessage } from '@kbn/i18n-react';
import { usePageUrlState, useUrlState } from '@kbn/ml-url-state';

import type { FieldValidationResults } from '@kbn/ml-category-validator';
import type { SearchQueryLanguage } from '@kbn/ml-query-utils';

import type {
Category,
SparkLinesPerCategory,
} from '../../../common/api/log_categorization/process_category_results';

import { useDataSource } from '../../hooks/use_data_source';
import { useData } from '../../hooks/use_data';
import { useSearch } from '../../hooks/use_search';
Expand All @@ -39,7 +44,7 @@ import {
import { SearchPanel } from '../search_panel';
import { PageHeader } from '../page_header';

import type { EventRate, Category, SparkLinesPerCategory } from './use_categorize_request';
import type { EventRate } from './use_categorize_request';
import { useCategorizeRequest } from './use_categorize_request';
import { CategoryTable } from './category_table';
import { DocumentCountChart } from './document_count_chart';
Expand Down
Loading

0 comments on commit 1fd0edc

Please sign in to comment.