Skip to content

Commit

Permalink
[ML] Fix Kibana date format and similar overrides in text structure e…
Browse files Browse the repository at this point in the history
…ndpoint (#84967) (#84973)

This PR fixes two problems. The first caused the second to be noticed:

1. The timestamp format pattern for Kibana's default date format was wrong - it
   was designed to match double digit day of month only, when Kibana uses a
   single digit for 1-9.
2. Trying to use a timestamp format override to work around the first problem
   didn't work. This exposed a second, more general problem. When a timestamp
   format override is supplied we try to work out if a built in timestamp
   format would always include it, and use that in preference. (This allows
   friendly format names like "iso8601" to replace complex patterns.) However,
   the way this ability to substitute was being determined was flawed, leading
   to incorrect substitutions. In the case of the Kibana timestamp format it
   resulted in the correct format supplied as an override being replaced with
   the buggy built in timestamp format. To fix this, substitutions are now
   only performed if the correct number of digits are present as well as the
   Grok pattern matching.
  • Loading branch information
droberts195 committed Mar 15, 2022
1 parent dc3d3b5 commit 8190962
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 16 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/84967.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 84967
summary: Fix Kibana date format and similar overrides in text structure endpoint
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -278,11 +278,11 @@ public final class TimestampFormatFinder {
),
// The Kibana export format
new CandidateTimestampFormat(
example -> Collections.singletonList("MMM dd, yyyy @ HH:mm:ss.SSS"),
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b",
example -> Collections.singletonList("MMM d, yyyy @ HH:mm:ss.SSS"),
"\\b[A-Z]\\S{2} \\d{1,2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b",
"\\b%{MONTH} %{MONTHDAY}, %{YEAR} @ %{HOUR}:%{MINUTE}:%{SECOND}\\b",
CUSTOM_TIMESTAMP_GROK_NAME,
" 11 1111 11 11 11 111",
Arrays.asList(" 11 1111 11 11 11 111", " 1 1111 11 11 11 111"),
0,
0
)
Expand Down Expand Up @@ -475,9 +475,10 @@ static CandidateTimestampFormat makeCandidateFromOverrideFormat(String overrideF
// Additionally it has the full 9 digits of fractional second precision, to avoid the possibility of truncating the fraction.
String generatedTimestamp = javaTimeFormatter.withZone(ZoneOffset.ofHoursMinutesSeconds(5, 45, 0))
.format(Instant.ofEpochMilli(981173106123L).plusNanos(456789L));
BitSet numberPosBitSet = stringToNumberPosBitSet(generatedTimestamp);
for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS) {

TimestampMatch match = checkCandidate(candidate, generatedTimestamp, null, true, timeoutChecker);
TimestampMatch match = checkCandidate(candidate, generatedTimestamp, numberPosBitSet, true, timeoutChecker);
if (match != null) {
return new CandidateTimestampFormat(example -> {

Expand Down Expand Up @@ -522,20 +523,27 @@ private static TimestampMatch checkCandidate(
boolean requireFullMatch,
TimeoutChecker timeoutChecker
) {
Tuple<Integer, Integer> boundsForCandidate = findBoundsForCandidate(candidate, numberPosBitSet);
if (requireFullMatch) {
Map<String, Object> captures = timeoutChecker.grokCaptures(
candidate.strictFullMatchGrok,
text,
"timestamp format determination"
);
if (captures != null) {
return new TimestampMatch(candidate, "", text, "");
// Even though the "strict" Grok pattern should only match text that can be parsed using
// the corresponding time format, the built in Grok patterns are not as strict as they
// could be. Therefore, enforce that the bit pattern also matches, as this will rule out
// problems like the %{MONTHDAY} Grok pattern matching both single and double digit days
// while the date format only matches double digit days.
if (boundsForCandidate.v1() == 0) {
Map<String, Object> captures = timeoutChecker.grokCaptures(
candidate.strictFullMatchGrok,
text,
"timestamp format determination"
);
if (captures != null) {
return new TimestampMatch(candidate, "", text, "");
}
}
} else {
// Since a search in a long string that has sections that nearly match will be very slow, it's
// worth doing an initial sanity check to see if the relative positions of digits necessary to
// get a match exist first
Tuple<Integer, Integer> boundsForCandidate = findBoundsForCandidate(candidate, numberPosBitSet);
// get a match exist first.
if (boundsForCandidate.v1() >= 0) {
assert boundsForCandidate.v2() > boundsForCandidate.v1();
String matchIn = text.substring(boundsForCandidate.v1(), Math.min(boundsForCandidate.v2(), text.length()));
Expand Down Expand Up @@ -582,7 +590,7 @@ private static TimestampMatch checkCandidate(
*/
public void addSample(String text) {

BitSet numberPosBitSet = requireFullMatch ? null : stringToNumberPosBitSet(text);
BitSet numberPosBitSet = stringToNumberPosBitSet(text);

for (CandidateTimestampFormat candidate : orderedCandidateFormats) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1179,8 +1179,8 @@ public void testFindFormatGivenOnlyKnownTimestampFormat() {
validateTimestampMatch(
"May 15, 2018 @ 17:14:56.374",
"CUSTOM_TIMESTAMP",
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b",
"MMM dd, yyyy @ HH:mm:ss.SSS",
"\\b[A-Z]\\S{2} \\d{1,2}, \\d{4} @ \\d{2}:\\d{2}:\\d{2}\\.\\d{3}\\b",
"MMM d, yyyy @ HH:mm:ss.SSS",
1526400896374L
);
}
Expand Down Expand Up @@ -1272,6 +1272,20 @@ public void testCustomOverridesNotMatchingBuiltInFormat() {
"%{MONTHDAY}\\.%{MONTHNUM2}\\. %{YEAR} %{HOUR}:%{MINUTE}:%{SECOND}"
)
);

validateCustomOverrideNotMatchingBuiltInFormat(
// This pattern is very close to HTTPDERROR_DATE, differing only because it contains a "d" instead of a "dd".
// This test therefore proves that we don't decide that this override can be replaced with the built in
// HTTPDERROR_DATE format, but do preserve it as a custom format.
"EEE MMM d HH:mm:ss yyyy",
"Mon Mar 7 15:03:23 2022",
"\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b",
"CUSTOM_TIMESTAMP",
Collections.singletonMap(
TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME,
"%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:%{SECOND} %{YEAR}"
)
);
}

private void validateCustomOverrideNotMatchingBuiltInFormat(
Expand Down

0 comments on commit 8190962

Please sign in to comment.