Skip to content

Commit

Permalink
[ML] Fixes for multi-line start patterns in text structure endpoint (#…
Browse files Browse the repository at this point in the history
…85066) (#85100)

This PR contains 3 fixes for the way multi-line start patterns are
created in the text structure endpoint:

1. For delimited files the multi-line start pattern used to be based
   on the first field that was a timestamp, boolean or number. This
   PR adds the option of a low cardinality keyword field as an
   alternative (i.e. an enum field effectively). It means there is
   more chance of a field early in each record being chosen as the
   mechanism for determining whether a line is the first line of a
   record.
2. The multi-line start pattern for delimited files now only permits
   the delimiter character between fields, not within quoted fields.
   Previously it was possible for the multi-line start pattern to
   match continuation lines. Unfortunately this may mean we can no
   longer determine a multi-line start pattern for files whose only
   suitable field is to the right of fields that sometimes contain
   commas, and the only solution in this case will be to reorder the
   columns before importing the data. Hopefully this problem will be
   very rare.
3. For semi-structured text log files there is now a cap on the
   complexity of the multi-line start pattern. It has been observed
   that the patterns generated for slightly malformed CSV files could
   run for days against the malformed lines of those files - the
   classic problem of a regex that doesn't match but nearly does doing
   lots of backtracking. We now throw an error in this situation and
   suggest overriding the format to delimited.

Relates #79708
Fixes elastic/kibana#121966
  • Loading branch information
droberts195 committed Mar 18, 2022
1 parent d0925dd commit 488cf48
Show file tree
Hide file tree
Showing 6 changed files with 378 additions and 16 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/85066.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 85066
summary: Fixes for multi-line start patterns in text structure endpoint
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.stream.Collectors;

Expand All @@ -37,6 +38,8 @@ public class DelimitedTextStructureFinder implements TextStructureFinder {
static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
private static final int LONG_FIELD_THRESHOLD = 100;
private static final int LOW_CARDINALITY_MAX_SIZE = 5;
private static final int LOW_CARDINALITY_MIN_RATIO = 3;
private final List<String> sampleMessages;
private final TextStructure structure;

Expand Down Expand Up @@ -180,11 +183,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
explanation,
columnNamesList,
maxLinesPerMessage,
delimiter,
delimiterPattern,
quotePattern,
fieldMappings,
sampleRecords,
timeField.v1(),
timeField.v2()
timeField.v2(),
timeoutChecker
)
);

Expand All @@ -207,11 +213,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
explanation,
columnNamesList,
maxLinesPerMessage,
delimiter,
delimiterPattern,
quotePattern,
fieldMappings,
sampleRecords,
null,
null
null,
timeoutChecker
)
);
}
Expand Down Expand Up @@ -744,11 +753,14 @@ static String makeMultilineStartPattern(
List<String> explanation,
List<String> columnNames,
int maxLinesPerMessage,
char delimiter,
String delimiterPattern,
String quotePattern,
Map<String, Object> fieldMappings,
List<Map<String, ?>> sampleRecords,
String timeFieldName,
TimestampFormatFinder timeFieldFormat
TimestampFormatFinder timeFieldFormat,
TimeoutChecker timeoutChecker
) {

assert columnNames.isEmpty() == false;
Expand Down Expand Up @@ -781,6 +793,7 @@ static String makeMultilineStartPattern(
case "boolean" -> "(?:true|false)";
case "byte", "short", "integer", "long" -> "[+-]?\\d+";
case "half_float", "float", "double" -> "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?";
case "keyword" -> findLowCardinalityKeywordPattern(columnName, sampleRecords, timeoutChecker);
default -> null;
};
if (columnPattern != null) {
Expand All @@ -797,7 +810,28 @@ static String makeMultilineStartPattern(
}
}
}
builder.append(".*?").append(delimiterPattern);
// We need to be strict about how many delimiters precede the chosen field,
// so if it's not the first then we cannot tolerate the preceding fields
// containing the delimiter. Additionally, there's no point choosing a field
// after a field that sometimes contains line breaks to identify the first
// line.
if (columnValueContainsDelimiterOrLineBreak(columnName, delimiter, sampleRecords, timeoutChecker)) {
throw new IllegalArgumentException(
"Cannot create a multi-line start pattern. "
+ "No suitable column to match exists before the first column whose values contain line breaks or delimiters ["
+ columnName
+ "]. If the timestamp format was not identified correctly adding an override for this may help."
);
}
builder.append("[^");
// Within a negated character class we don't want to escape special regex
// characters like dot, hence shouldn't use the pre-built pattern
if (delimiter == '\t') {
builder.append("\\t");
} else {
builder.append(delimiter);
}
builder.append("]*?").append(delimiterPattern);
}
// TODO: if this happens a lot then we should try looking for the a multi-line END pattern instead of a start pattern.
// But this would require changing the find_structure response, and the file upload UI, and would make creating Filebeat
Expand All @@ -806,6 +840,94 @@ static String makeMultilineStartPattern(
return null;
}

/**
* @return <code>true</code> if the value of the field {@code columnName} in any record in the {@code sampleRecords}
* contains the {@code delimiter} or a line break.
*/
static boolean columnValueContainsDelimiterOrLineBreak(
String columnName,
char delimiter,
List<Map<String, ?>> sampleRecords,
TimeoutChecker timeoutChecker
) {
for (Map<String, ?> sampleRecord : sampleRecords) {
timeoutChecker.check("delimiter search in multi-line start pattern determination");
Object value = sampleRecord.get(columnName);
if (value != null) {
String str = value.toString();
if (str.indexOf(delimiter) >= 0 || str.indexOf('\n') >= 0) {
return true;
}
}
}
return false;
}

/**
* Try to find a regular expression that will match any of the values of a keyword field, providing:
* 1. There are only a small number of distinct values of that keyword field
* 2. The number of sampled records is several times bigger than the number of distinct values
* 3. None of the values is empty or contains a line break
* 4. None of the values matches the last line of a value of some other field in the sampled records
* @return A regular expression that will match the small number of distinct values of the keyword field, or
* <code>null</code> if a suitable regular expression could not be found.
*/
static String findLowCardinalityKeywordPattern(String columnName, List<Map<String, ?>> sampleRecords, TimeoutChecker timeoutChecker) {

int maxCardinality = Math.min(LOW_CARDINALITY_MAX_SIZE, sampleRecords.size() / LOW_CARDINALITY_MIN_RATIO);

// Find the distinct values of the column, aborting if there are too many or if any contain newlines.
Set<String> values = new HashSet<>();
for (Map<String, ?> sampleRecord : sampleRecords) {
Object value = sampleRecord.get(columnName);
if (value == null) {
return null;
}
String str = value.toString();
if (str.isEmpty() || str.indexOf('\n') >= 0) {
return null;
}
values.add(str);
if (values.size() > maxCardinality) {
return null;
}
}

// Check that none of the values exist in other columns.
// In the case of field values that span multiple lines, it's the part after the last newline that matters.
for (Map<String, ?> sampleRecord : sampleRecords) {
timeoutChecker.check("keyword-based multi-line start pattern determination");
if (sampleRecord.entrySet()
.stream()
.anyMatch(entry -> entry.getKey().equals(columnName) == false && containsLastLine(values, entry.getValue()))) {
return null;
}
}

return values.stream()
.map(value -> value.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1"))
.sorted()
.collect(Collectors.joining("|", "(?:", ")"));
}

/**
* @param set A set of strings.
* @param obj An object whose string representation may or may not contain line breaks.
* @return true if {@code set} contains the last line of {@code str} (i.e. the whole of {@code str} if it has no line breaks).
*/
static boolean containsLastLine(Set<String> set, Object obj) {
if (obj == null) {
return false;
}
String str = obj.toString();
int lastNewline = str.lastIndexOf('\n');
if (lastNewline >= 0) {
return set.contains(str.substring(lastNewline + 1));
} else {
return set.contains(str);
}
}

/**
* Make a regular expression that Filebeat can use to ignore the header line of the delimited file.
* (Such lines may be observed multiple times if multiple delimited files are concatenated.)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,37 +391,80 @@ private void addIntermediateRegex(Collection<String> snippets) {
addIntermediateRegex(overallGrokPatternBuilder, snippets);
}

public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
/**
* Create a regular expression that matches all of a supplied collection of strings. The regular expression is chosen
* such that it explicitly matches the punctuation and whitespace that's common to all the strings, but uses wildcards
* to match other characters.
* @param patternBuilder The intermediate regular expression will be appended to this string builder.
* @param snippets The portions of all the sampled messages that the generated regular expression must match.
* @return The highest count of a particular explicit character in the generated pattern that is followed by a wildcard.
* This value gives an indication about the complexity of matching the returned pattern against a string that
* nearly matches but not quite.
*/
public static int addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
if (snippets.isEmpty()) {
return;
return 0;
}

List<String> others = new ArrayList<>(snippets);
String driver = others.remove(others.size() - 1);

char lastPunctOrSpace = '\0';
List<Character> charsPrecedingWildcard = new ArrayList<>();

boolean wildcardRequiredIfNonMatchFound = true;
for (int i = 0; i < driver.length(); ++i) {
char ch = driver.charAt(i);
Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
patternBuilder.append(".*?");
charsPrecedingWildcard.add(lastPunctOrSpace);
}
if (punctuationOrSpaceNeedsEscaping) {
patternBuilder.append('\\');
}
patternBuilder.append(ch);
wildcardRequiredIfNonMatchFound = true;
others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
lastPunctOrSpace = ch;
} else if (wildcardRequiredIfNonMatchFound) {
patternBuilder.append(".*?");
charsPrecedingWildcard.add(lastPunctOrSpace);
wildcardRequiredIfNonMatchFound = false;
}
}

if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
patternBuilder.append(".*?");
}

return longestRun(charsPrecedingWildcard);
}

/**
* @return The length of the longest subsequence of identical values in {@code sequence}.
*/
static int longestRun(List<?> sequence) {
if (sequence.size() <= 1) {
return sequence.size();
}
int maxSoFar = 0;
int thisCount = 1;
for (int index = 1; index < sequence.size(); ++index) {
if (sequence.get(index).equals(sequence.get(index - 1))) {
++thisCount;
} else {
maxSoFar = Math.max(maxSoFar, thisCount);
// The next run cannot be longer if we're nearer the end than the max so far
if (maxSoFar >= sequence.size() - index) {
return maxSoFar;
}
thisCount = 1;
}
}
maxSoFar = Math.max(maxSoFar, thisCount);
return maxSoFar;
}

private void finalizeGrokPattern(Collection<String> snippets) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

public class LogTextStructureFinder implements TextStructureFinder {

private static final int TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS = 8;
private final List<String> sampleMessages;
private final TextStructure structure;

Expand Down Expand Up @@ -250,11 +251,25 @@ static TimestampFormatFinder populateTimestampFormatFinder(
static String createMultiLineMessageStartRegex(Collection<String> prefaces, String simpleDateRegex) {

StringBuilder builder = new StringBuilder("^");
GrokPatternCreator.addIntermediateRegex(builder, prefaces);
int complexity = GrokPatternCreator.addIntermediateRegex(builder, prefaces);
builder.append(simpleDateRegex);
if (builder.substring(0, 3).equals("^\\b")) {
builder.delete(1, 3);
}
// This is here primarily to protect against the horrible patterns that are generated when a not-quite-valid-CSV file
// has its timestamp column near the end of each line. The algorithm used to produce the multi-line start patterns can
// then produce patterns like this:
// ^.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}
// If a pattern like this is matched against a line that nearly matches but not quite (which is basically guaranteed in
// the not-quite-valid-CSV file case) then the backtracking will cause the match attempt to run for many days. Therefore
// it's better to just error out in this case and let the user try again with overrides.
if (complexity >= TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS) {
throw new IllegalArgumentException(
"Generated multi-line start pattern based on timestamp position ["
+ builder
+ "] is too complex. If your sample is delimited then try overriding the format."
);
}
return builder.toString();
}
}

0 comments on commit 488cf48

Please sign in to comment.