Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Fixes for multi-line start patterns in text structure endpoint #85066

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/85066.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 85066
summary: Fixes for multi-line start patterns in text structure endpoint
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.stream.Collectors;

Expand All @@ -37,6 +38,8 @@ public class DelimitedTextStructureFinder implements TextStructureFinder {
static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
private static final int LONG_FIELD_THRESHOLD = 100;
private static final int LOW_CARDINALITY_MAX_SIZE = 5;
private static final int LOW_CARDINALITY_MIN_RATIO = 3;
private final List<String> sampleMessages;
private final TextStructure structure;

Expand Down Expand Up @@ -180,11 +183,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
explanation,
columnNamesList,
maxLinesPerMessage,
delimiter,
delimiterPattern,
quotePattern,
fieldMappings,
sampleRecords,
timeField.v1(),
timeField.v2()
timeField.v2(),
timeoutChecker
)
);

Expand All @@ -207,11 +213,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
explanation,
columnNamesList,
maxLinesPerMessage,
delimiter,
delimiterPattern,
quotePattern,
fieldMappings,
sampleRecords,
null,
null
null,
timeoutChecker
)
);
}
Expand Down Expand Up @@ -744,11 +753,14 @@ static String makeMultilineStartPattern(
List<String> explanation,
List<String> columnNames,
int maxLinesPerMessage,
char delimiter,
String delimiterPattern,
String quotePattern,
Map<String, Object> fieldMappings,
List<Map<String, ?>> sampleRecords,
String timeFieldName,
TimestampFormatFinder timeFieldFormat
TimestampFormatFinder timeFieldFormat,
TimeoutChecker timeoutChecker
) {

assert columnNames.isEmpty() == false;
Expand Down Expand Up @@ -781,6 +793,7 @@ static String makeMultilineStartPattern(
case "boolean" -> "(?:true|false)";
case "byte", "short", "integer", "long" -> "[+-]?\\d+";
case "half_float", "float", "double" -> "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?";
case "keyword" -> findLowCardinalityKeywordPattern(columnName, sampleRecords, timeoutChecker);
default -> null;
};
if (columnPattern != null) {
Expand All @@ -797,7 +810,28 @@ static String makeMultilineStartPattern(
}
}
}
builder.append(".*?").append(delimiterPattern);
// We need to be strict about how many delimiters precede the chosen field,
// so if it's not the first then we cannot tolerate the preceding fields
// containing the delimiter. Additionally, there's no point choosing a field
// after a field that sometimes contains line breaks to identify the first
// line.
if (columnValueContainsDelimiterOrLineBreak(columnName, delimiter, sampleRecords, timeoutChecker)) {
throw new IllegalArgumentException(
"Cannot create a multi-line start pattern. "
+ "No suitable column to match exists before the first column whose values contain line breaks or delimiters ["
+ columnName
+ "]. If the timestamp format was not identified correctly adding an override for this may help."
);
}
builder.append("[^");
// Within a negated character class we don't want to escape special regex
// characters like dot, hence shouldn't use the pre-built pattern
if (delimiter == '\t') {
builder.append("\\t");
} else {
builder.append(delimiter);
}
builder.append("]*?").append(delimiterPattern);
}
// TODO: if this happens a lot then we should try looking for the a multi-line END pattern instead of a start pattern.
// But this would require changing the find_structure response, and the file upload UI, and would make creating Filebeat
Expand All @@ -806,6 +840,94 @@ static String makeMultilineStartPattern(
return null;
}

/**
* @return <code>true</code> if the value of the field {@code columnName} in any record in the {@code sampleRecords}
* contains the {@code delimiter} or a line break.
*/
static boolean columnValueContainsDelimiterOrLineBreak(
String columnName,
char delimiter,
List<Map<String, ?>> sampleRecords,
TimeoutChecker timeoutChecker
) {
for (Map<String, ?> sampleRecord : sampleRecords) {
timeoutChecker.check("delimiter search in multi-line start pattern determination");
Object value = sampleRecord.get(columnName);
if (value != null) {
String str = value.toString();
if (str.indexOf(delimiter) >= 0 || str.indexOf('\n') >= 0) {
return true;
}
}
}
return false;
}

/**
* Try to find a regular expression that will match any of the values of a keyword field, providing:
* 1. There are only a small number of distinct values of that keyword field
* 2. The number of sampled records is several times bigger than the number of distinct values
* 3. None of the values is empty or contains a line break
* 4. None of the values matches the last line of a value of some other field in the sampled records
* @return A regular expression that will match the small number of distinct values of the keyword field, or
* <code>null</code> if a suitable regular expression could not be found.
*/
static String findLowCardinalityKeywordPattern(String columnName, List<Map<String, ?>> sampleRecords, TimeoutChecker timeoutChecker) {

int maxCardinality = Math.min(LOW_CARDINALITY_MAX_SIZE, sampleRecords.size() / LOW_CARDINALITY_MIN_RATIO);

// Find the distinct values of the column, aborting if there are too many or if any contain newlines.
Set<String> values = new HashSet<>();
for (Map<String, ?> sampleRecord : sampleRecords) {
Object value = sampleRecord.get(columnName);
if (value == null) {
return null;
}
String str = value.toString();
if (str.isEmpty() || str.indexOf('\n') >= 0) {
return null;
}
values.add(str);
if (values.size() > maxCardinality) {
return null;
}
}

// Check that none of the values exist in other columns.
// In the case of field values that span multiple lines, it's the part after the last newline that matters.
for (Map<String, ?> sampleRecord : sampleRecords) {
timeoutChecker.check("keyword-based multi-line start pattern determination");
if (sampleRecord.entrySet()
.stream()
.anyMatch(entry -> entry.getKey().equals(columnName) == false && containsLastLine(values, entry.getValue()))) {
return null;
}
}

return values.stream()
.map(value -> value.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1"))
.sorted()
.collect(Collectors.joining("|", "(?:", ")"));
}

/**
* @param set A set of strings.
* @param obj An object whose string representation may or may not contain line breaks.
* @return true if {@code set} contains the last line of {@code str} (i.e. the whole of {@code str} if it has no line breaks).
*/
static boolean containsLastLine(Set<String> set, Object obj) {
if (obj == null) {
return false;
}
String str = obj.toString();
int lastNewline = str.lastIndexOf('\n');
if (lastNewline >= 0) {
return set.contains(str.substring(lastNewline + 1));
} else {
return set.contains(str);
}
}

/**
* Make a regular expression that Filebeat can use to ignore the header line of the delimited file.
* (Such lines may be observed multiple times if multiple delimited files are concatenated.)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,37 +391,80 @@ private void addIntermediateRegex(Collection<String> snippets) {
addIntermediateRegex(overallGrokPatternBuilder, snippets);
}

public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
/**
* Create a regular expression that matches all of a supplied collection of strings. The regular expression is chosen
* such that it explicitly matches the punctuation and whitespace that's common to all the strings, but uses wildcards
* to match other characters.
* @param patternBuilder The intermediate regular expression will be appended to this string builder.
* @param snippets The portions of all the sampled messages that the generated regular expression must match.
* @return The highest count of a particular explicit character in the generated pattern that is followed by a wildcard.
* This value gives an indication about the complexity of matching the returned pattern against a string that
* nearly matches but not quite.
*/
public static int addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
if (snippets.isEmpty()) {
return;
return 0;
}

List<String> others = new ArrayList<>(snippets);
String driver = others.remove(others.size() - 1);

char lastPunctOrSpace = '\0';
List<Character> charsPrecedingWildcard = new ArrayList<>();

boolean wildcardRequiredIfNonMatchFound = true;
for (int i = 0; i < driver.length(); ++i) {
char ch = driver.charAt(i);
Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
patternBuilder.append(".*?");
charsPrecedingWildcard.add(lastPunctOrSpace);
}
if (punctuationOrSpaceNeedsEscaping) {
patternBuilder.append('\\');
}
patternBuilder.append(ch);
wildcardRequiredIfNonMatchFound = true;
others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
lastPunctOrSpace = ch;
} else if (wildcardRequiredIfNonMatchFound) {
patternBuilder.append(".*?");
charsPrecedingWildcard.add(lastPunctOrSpace);
wildcardRequiredIfNonMatchFound = false;
}
}

if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
patternBuilder.append(".*?");
}

return longestRun(charsPrecedingWildcard);
}

/**
* @return The length of the longest subsequence of identical values in {@code sequence}.
*/
static int longestRun(List<?> sequence) {
if (sequence.size() <= 1) {
return sequence.size();
}
int maxSoFar = 0;
int thisCount = 1;
for (int index = 1; index < sequence.size(); ++index) {
if (sequence.get(index).equals(sequence.get(index - 1))) {
++thisCount;
} else {
maxSoFar = Math.max(maxSoFar, thisCount);
// The next run cannot be longer if we're nearer the end than the max so far
if (maxSoFar >= sequence.size() - index) {
return maxSoFar;
}
thisCount = 1;
}
}
maxSoFar = Math.max(maxSoFar, thisCount);
return maxSoFar;
}

private void finalizeGrokPattern(Collection<String> snippets) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

public class LogTextStructureFinder implements TextStructureFinder {

private static final int TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS = 8;
private final List<String> sampleMessages;
private final TextStructure structure;

Expand Down Expand Up @@ -250,11 +251,25 @@ static TimestampFormatFinder populateTimestampFormatFinder(
static String createMultiLineMessageStartRegex(Collection<String> prefaces, String simpleDateRegex) {

StringBuilder builder = new StringBuilder("^");
GrokPatternCreator.addIntermediateRegex(builder, prefaces);
int complexity = GrokPatternCreator.addIntermediateRegex(builder, prefaces);
builder.append(simpleDateRegex);
if (builder.substring(0, 3).equals("^\\b")) {
builder.delete(1, 3);
}
// This is here primarily to protect against the horrible patterns that are generated when a not-quite-valid-CSV file
// has its timestamp column near the end of each line. The algorithm used to produce the multi-line start patterns can
// then produce patterns like this:
// ^.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}
// If a pattern like this is matched against a line that nearly matches but not quite (which is basically guaranteed in
// the not-quite-valid-CSV file case) then the backtracking will cause the match attempt to run for many days. Therefore
// it's better to just error out in this case and let the user try again with overrides.
if (complexity >= TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS) {
throw new IllegalArgumentException(
"Generated multi-line start pattern based on timestamp position ["
+ builder
+ "] is too complex. If your sample is delimited then try overriding the format."
);
}
return builder.toString();
}
}
Loading