[ML] Fixes for multi-line start patterns in text structure endpoint (#…

…85066) (#85100) This PR contains 3 fixes for the way multi-line start patterns are created in the text structure endpoint: 1. For delimited files the multi-line start pattern used to be based on the first field that was a timestamp, boolean or number. This PR adds the option of a low cardinality keyword field as an alternative (i.e. an enum field effectively). It means there is more chance of a field early in each record being chosen as the mechanism for determining whether a line is the first line of a record. 2. The multi-line start pattern for delimited files now only permits the delimiter character between fields, not within quoted fields. Previously it was possible for the multi-line start pattern to match continuation lines. Unfortunately this may mean we can no longer determine a multi-line start pattern for files whose only suitable field is to the right of fields that sometimes contain commas, and the only solution in this case will be to reorder the columns before importing the data. Hopefully this problem will be very rare. 3. For semi-structured text log files there is now a cap on the complexity of the multi-line start pattern. It has been observed that the patterns generated for slightly malformed CSV files could run for days against the malformed lines of those files - the classic problem of a regex that doesn't match but nearly does doing lots of backtracking. We now throw an error in this situation and suggest overriding the format to delimited. Relates #79708 Fixes elastic/kibana#121966
elastic · Mar 18, 2022 · 488cf48 · 488cf48
1 parent d0925dd
commit 488cf48
Show file tree

Hide file tree

Showing 6 changed files with 378 additions and 16 deletions.
diff --git a/docs/changelog/85066.yaml b/docs/changelog/85066.yaml
@@ -0,0 +1,5 @@
+pr: 85066
+summary: Fixes for multi-line start patterns in text structure endpoint
+area: Machine Learning
+type: bug
+issues: []
diff --git a/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java b/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java
@@ -28,6 +28,7 @@
 import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.stream.Collectors;
 
@@ -37,6 +38,8 @@ public class DelimitedTextStructureFinder implements TextStructureFinder {
     static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
     private static final int LONG_FIELD_THRESHOLD = 100;
+    private static final int LOW_CARDINALITY_MAX_SIZE = 5;
+    private static final int LOW_CARDINALITY_MIN_RATIO = 3;
     private final List<String> sampleMessages;
     private final TextStructure structure;
 
@@ -180,11 +183,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
                         explanation,
                         columnNamesList,
                         maxLinesPerMessage,
+                        delimiter,
                         delimiterPattern,
                         quotePattern,
                         fieldMappings,
+                        sampleRecords,
                         timeField.v1(),
-                        timeField.v2()
+                        timeField.v2(),
+                        timeoutChecker
                     )
                 );
 
@@ -207,11 +213,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
                     explanation,
                     columnNamesList,
                     maxLinesPerMessage,
+                    delimiter,
                     delimiterPattern,
                     quotePattern,
                     fieldMappings,
+                    sampleRecords,
                     null,
-                    null
+                    null,
+                    timeoutChecker
                 )
             );
         }
@@ -744,11 +753,14 @@ static String makeMultilineStartPattern(
         List<String> explanation,
         List<String> columnNames,
         int maxLinesPerMessage,
+        char delimiter,
         String delimiterPattern,
         String quotePattern,
         Map<String, Object> fieldMappings,
+        List<Map<String, ?>> sampleRecords,
         String timeFieldName,
-        TimestampFormatFinder timeFieldFormat
+        TimestampFormatFinder timeFieldFormat,
+        TimeoutChecker timeoutChecker
     ) {
 
         assert columnNames.isEmpty() == false;
@@ -781,6 +793,7 @@ static String makeMultilineStartPattern(
                         case "boolean" -> "(?:true|false)";
                         case "byte", "short", "integer", "long" -> "[+-]?\\d+";
                         case "half_float", "float", "double" -> "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?";
+                        case "keyword" -> findLowCardinalityKeywordPattern(columnName, sampleRecords, timeoutChecker);
                         default -> null;
                     };
                     if (columnPattern != null) {
@@ -797,7 +810,28 @@ static String makeMultilineStartPattern(
                     }
                 }
             }
-            builder.append(".*?").append(delimiterPattern);
+            // We need to be strict about how many delimiters precede the chosen field,
+            // so if it's not the first then we cannot tolerate the preceding fields
+            // containing the delimiter. Additionally, there's no point choosing a field
+            // after a field that sometimes contains line breaks to identify the first
+            // line.
+            if (columnValueContainsDelimiterOrLineBreak(columnName, delimiter, sampleRecords, timeoutChecker)) {
+                throw new IllegalArgumentException(
+                    "Cannot create a multi-line start pattern. "
+                        + "No suitable column to match exists before the first column whose values contain line breaks or delimiters ["
+                        + columnName
+                        + "]. If the timestamp format was not identified correctly adding an override for this may help."
+                );
+            }
+            builder.append("[^");
+            // Within a negated character class we don't want to escape special regex
+            // characters like dot, hence shouldn't use the pre-built pattern
+            if (delimiter == '\t') {
+                builder.append("\\t");
+            } else {
+                builder.append(delimiter);
+            }
+            builder.append("]*?").append(delimiterPattern);
         }
         // TODO: if this happens a lot then we should try looking for the a multi-line END pattern instead of a start pattern.
         // But this would require changing the find_structure response, and the file upload UI, and would make creating Filebeat
@@ -806,6 +840,94 @@ static String makeMultilineStartPattern(
         return null;
     }
 
+    /**
+     * @return <code>true</code> if the value of the field {@code columnName} in any record in the {@code sampleRecords}
+     *         contains the {@code delimiter} or a line break.
+     */
+    static boolean columnValueContainsDelimiterOrLineBreak(
+        String columnName,
+        char delimiter,
+        List<Map<String, ?>> sampleRecords,
+        TimeoutChecker timeoutChecker
+    ) {
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            timeoutChecker.check("delimiter search in multi-line start pattern determination");
+            Object value = sampleRecord.get(columnName);
+            if (value != null) {
+                String str = value.toString();
+                if (str.indexOf(delimiter) >= 0 || str.indexOf('\n') >= 0) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Try to find a regular expression that will match any of the values of a keyword field, providing:
+     * 1. There are only a small number of distinct values of that keyword field
+     * 2. The number of sampled records is several times bigger than the number of distinct values
+     * 3. None of the values is empty or contains a line break
+     * 4. None of the values matches the last line of a value of some other field in the sampled records
+     * @return A regular expression that will match the small number of distinct values of the keyword field, or
+     *         <code>null</code> if a suitable regular expression could not be found.
+     */
+    static String findLowCardinalityKeywordPattern(String columnName, List<Map<String, ?>> sampleRecords, TimeoutChecker timeoutChecker) {
+
+        int maxCardinality = Math.min(LOW_CARDINALITY_MAX_SIZE, sampleRecords.size() / LOW_CARDINALITY_MIN_RATIO);
+
+        // Find the distinct values of the column, aborting if there are too many or if any contain newlines.
+        Set<String> values = new HashSet<>();
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            Object value = sampleRecord.get(columnName);
+            if (value == null) {
+                return null;
+            }
+            String str = value.toString();
+            if (str.isEmpty() || str.indexOf('\n') >= 0) {
+                return null;
+            }
+            values.add(str);
+            if (values.size() > maxCardinality) {
+                return null;
+            }
+        }
+
+        // Check that none of the values exist in other columns.
+        // In the case of field values that span multiple lines, it's the part after the last newline that matters.
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            timeoutChecker.check("keyword-based multi-line start pattern determination");
+            if (sampleRecord.entrySet()
+                .stream()
+                .anyMatch(entry -> entry.getKey().equals(columnName) == false && containsLastLine(values, entry.getValue()))) {
+                return null;
+            }
+        }
+
+        return values.stream()
+            .map(value -> value.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1"))
+            .sorted()
+            .collect(Collectors.joining("|", "(?:", ")"));
+    }
+
+    /**
+     * @param set A set of strings.
+     * @param obj An object whose string representation may or may not contain line breaks.
+     * @return true if {@code set} contains the last line of {@code str} (i.e. the whole of {@code str} if it has no line breaks).
+     */
+    static boolean containsLastLine(Set<String> set, Object obj) {
+        if (obj == null) {
+            return false;
+        }
+        String str = obj.toString();
+        int lastNewline = str.lastIndexOf('\n');
+        if (lastNewline >= 0) {
+            return set.contains(str.substring(lastNewline + 1));
+        } else {
+            return set.contains(str);
+        }
+    }
+
     /**
      * Make a regular expression that Filebeat can use to ignore the header line of the delimited file.
      * (Such lines may be observed multiple times if multiple delimited files are concatenated.)

diff --git a/...c/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java b/...c/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java
@@ -391,37 +391,80 @@ private void addIntermediateRegex(Collection<String> snippets) {
         addIntermediateRegex(overallGrokPatternBuilder, snippets);
     }
 
-    public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
+    /**
+     * Create a regular expression that matches all of a supplied collection of strings. The regular expression is chosen
+     * such that it explicitly matches the punctuation and whitespace that's common to all the strings, but uses wildcards
+     * to match other characters.
+     * @param patternBuilder The intermediate regular expression will be appended to this string builder.
+     * @param snippets The portions of all the sampled messages that the generated regular expression must match.
+     * @return The highest count of a particular explicit character in the generated pattern that is followed by a wildcard.
+     *         This value gives an indication about the complexity of matching the returned pattern against a string that
+     *         nearly matches but not quite.
+     */
+    public static int addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
         if (snippets.isEmpty()) {
-            return;
+            return 0;
         }
 
         List<String> others = new ArrayList<>(snippets);
         String driver = others.remove(others.size() - 1);
 
+        char lastPunctOrSpace = '\0';
+        List<Character> charsPrecedingWildcard = new ArrayList<>();
+
         boolean wildcardRequiredIfNonMatchFound = true;
         for (int i = 0; i < driver.length(); ++i) {
             char ch = driver.charAt(i);
             Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
             if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
                 if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
                     patternBuilder.append(".*?");
+                    charsPrecedingWildcard.add(lastPunctOrSpace);
                 }
                 if (punctuationOrSpaceNeedsEscaping) {
                     patternBuilder.append('\\');
                 }
                 patternBuilder.append(ch);
                 wildcardRequiredIfNonMatchFound = true;
                 others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
+                lastPunctOrSpace = ch;
             } else if (wildcardRequiredIfNonMatchFound) {
                 patternBuilder.append(".*?");
+                charsPrecedingWildcard.add(lastPunctOrSpace);
                 wildcardRequiredIfNonMatchFound = false;
             }
         }
 
         if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
             patternBuilder.append(".*?");
         }
+
+        return longestRun(charsPrecedingWildcard);
+    }
+
+    /**
+     * @return The length of the longest subsequence of identical values in {@code sequence}.
+     */
+    static int longestRun(List<?> sequence) {
+        if (sequence.size() <= 1) {
+            return sequence.size();
+        }
+        int maxSoFar = 0;
+        int thisCount = 1;
+        for (int index = 1; index < sequence.size(); ++index) {
+            if (sequence.get(index).equals(sequence.get(index - 1))) {
+                ++thisCount;
+            } else {
+                maxSoFar = Math.max(maxSoFar, thisCount);
+                // The next run cannot be longer if we're nearer the end than the max so far
+                if (maxSoFar >= sequence.size() - index) {
+                    return maxSoFar;
+                }
+                thisCount = 1;
+            }
+        }
+        maxSoFar = Math.max(maxSoFar, thisCount);
+        return maxSoFar;
     }
 
     private void finalizeGrokPattern(Collection<String> snippets) {

diff --git a/...in/java/org/elasticsearch/xpack/textstructure/structurefinder/LogTextStructureFinder.java b/...in/java/org/elasticsearch/xpack/textstructure/structurefinder/LogTextStructureFinder.java
@@ -23,6 +23,7 @@
 
 public class LogTextStructureFinder implements TextStructureFinder {
 
+    private static final int TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS = 8;
     private final List<String> sampleMessages;
     private final TextStructure structure;
 
@@ -250,11 +251,25 @@ static TimestampFormatFinder populateTimestampFormatFinder(
     static String createMultiLineMessageStartRegex(Collection<String> prefaces, String simpleDateRegex) {
 
         StringBuilder builder = new StringBuilder("^");
-        GrokPatternCreator.addIntermediateRegex(builder, prefaces);
+        int complexity = GrokPatternCreator.addIntermediateRegex(builder, prefaces);
         builder.append(simpleDateRegex);
         if (builder.substring(0, 3).equals("^\\b")) {
             builder.delete(1, 3);
         }
+        // This is here primarily to protect against the horrible patterns that are generated when a not-quite-valid-CSV file
+        // has its timestamp column near the end of each line. The algorithm used to produce the multi-line start patterns can
+        // then produce patterns like this:
+        // ^.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}
+        // If a pattern like this is matched against a line that nearly matches but not quite (which is basically guaranteed in
+        // the not-quite-valid-CSV file case) then the backtracking will cause the match attempt to run for many days. Therefore
+        // it's better to just error out in this case and let the user try again with overrides.
+        if (complexity >= TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS) {
+            throw new IllegalArgumentException(
+                "Generated multi-line start pattern based on timestamp position ["
+                    + builder
+                    + "] is too complex. If your sample is delimited then try overriding the format."
+            );
+        }
         return builder.toString();
     }
 }