elastic · droberts195 · Mar 18, 2022 · Mar 17, 2022 · Mar 17, 2022 · Mar 17, 2022
diff --git a/docs/changelog/85066.yaml b/docs/changelog/85066.yaml
@@ -0,0 +1,5 @@
+pr: 85066
+summary: Fixes for multi-line start patterns in text structure endpoint
+area: Machine Learning
+type: bug
+issues: []
diff --git a/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java b/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java
@@ -28,6 +28,7 @@
 import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.stream.Collectors;
 
@@ -37,6 +38,8 @@ public class DelimitedTextStructureFinder implements TextStructureFinder {
     static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
     private static final int LONG_FIELD_THRESHOLD = 100;
+    private static final int LOW_CARDINALITY_MAX_SIZE = 5;
+    private static final int LOW_CARDINALITY_MIN_RATIO = 3;
     private final List<String> sampleMessages;
     private final TextStructure structure;
 
@@ -180,11 +183,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
                         explanation,
                         columnNamesList,
                         maxLinesPerMessage,
+                        delimiter,
                         delimiterPattern,
                         quotePattern,
                         fieldMappings,
+                        sampleRecords,
                         timeField.v1(),
-                        timeField.v2()
+                        timeField.v2(),
+                        timeoutChecker
                     )
                 );
 
@@ -207,11 +213,14 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
                     explanation,
                     columnNamesList,
                     maxLinesPerMessage,
+                    delimiter,
                     delimiterPattern,
                     quotePattern,
                     fieldMappings,
+                    sampleRecords,
                     null,
-                    null
+                    null,
+                    timeoutChecker
                 )
             );
         }
@@ -744,11 +753,14 @@ static String makeMultilineStartPattern(
         List<String> explanation,
         List<String> columnNames,
         int maxLinesPerMessage,
+        char delimiter,
         String delimiterPattern,
         String quotePattern,
         Map<String, Object> fieldMappings,
+        List<Map<String, ?>> sampleRecords,
         String timeFieldName,
-        TimestampFormatFinder timeFieldFormat
+        TimestampFormatFinder timeFieldFormat,
+        TimeoutChecker timeoutChecker
     ) {
 
         assert columnNames.isEmpty() == false;
@@ -781,6 +793,7 @@ static String makeMultilineStartPattern(
                         case "boolean" -> "(?:true|false)";
                         case "byte", "short", "integer", "long" -> "[+-]?\\d+";
                         case "half_float", "float", "double" -> "[+-]?(?:\\d+(?:\\.\\d+)?|\\.\\d+)(?:[eE][+-]?\\d+)?";
+                        case "keyword" -> findLowCardinalityKeywordPattern(columnName, sampleRecords, timeoutChecker);
                         default -> null;
                     };
                     if (columnPattern != null) {
@@ -797,7 +810,28 @@ static String makeMultilineStartPattern(
                     }
                 }
             }
-            builder.append(".*?").append(delimiterPattern);
+            // We need to be strict about how many delimiters precede the chosen field,
+            // so if it's not the first then we cannot tolerate the preceding fields
+            // containing the delimiter. Additionally, there's no point choosing a field
+            // after a field that sometimes contains line breaks to identify the first
+            // line.
+            if (columnValueContainsDelimiterOrLineBreak(columnName, delimiter, sampleRecords, timeoutChecker)) {
+                throw new IllegalArgumentException(
+                    "Cannot create a multi-line start pattern. "
+                        + "No suitable column to match exists before the first column whose values contain line breaks or delimiters ["
+                        + columnName
+                        + "]. If the timestamp format was not identified correctly adding an override for this may help."
+                );
+            }
+            builder.append("[^");
+            // Within a negated character class we don't want to escape special regex
+            // characters like dot, hence shouldn't use the pre-built pattern
+            if (delimiter == '\t') {
+                builder.append("\\t");
+            } else {
+                builder.append(delimiter);
+            }
+            builder.append("]*?").append(delimiterPattern);
         }
         // TODO: if this happens a lot then we should try looking for the a multi-line END pattern instead of a start pattern.
         // But this would require changing the find_structure response, and the file upload UI, and would make creating Filebeat
@@ -806,6 +840,94 @@ static String makeMultilineStartPattern(
         return null;
     }
 
+    /**
+     * @return <code>true</code> if the string form of the field {@code columnName} in any record in the {@code sampleRecords}
+     *         contains the {@code delimiter} or a newline character. Records where the field is missing or null are skipped.
+     */
+    static boolean columnValueContainsDelimiterOrLineBreak(
+        String columnName,
+        char delimiter,
+        List<Map<String, ?>> sampleRecords,
+        TimeoutChecker timeoutChecker
+    ) {
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            timeoutChecker.check("delimiter search in multi-line start pattern determination");
+            Object value = sampleRecord.get(columnName);
+            if (value != null) {
+                String str = value.toString();
+                if (str.indexOf(delimiter) >= 0 || str.indexOf('\n') >= 0) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Try to find a regular expression that will match any of the values of a keyword field, providing:
+     * 1. There are only a small number of distinct values of that keyword field
+     * 2. The number of sampled records is several times bigger than the number of distinct values
+     * 3. None of the values is missing, empty or contains a line break
+     * 4. None of the values matches the last line of a value of some other field in the sampled records
+     * @return A non-capturing alternation, e.g. <code>(?:a|b)</code>, of the escaped distinct values of the keyword field, or
+     *         <code>null</code> if a suitable regular expression could not be found.
+     */
+    static String findLowCardinalityKeywordPattern(String columnName, List<Map<String, ?>> sampleRecords, TimeoutChecker timeoutChecker) {
+
+        int maxCardinality = Math.min(LOW_CARDINALITY_MAX_SIZE, sampleRecords.size() / LOW_CARDINALITY_MIN_RATIO);
+
+        // Find the distinct values of the column, aborting if there are too many or if any are missing, empty or contain newlines.
+        Set<String> values = new HashSet<>();
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            Object value = sampleRecord.get(columnName);
+            if (value == null) {
+                return null;
+            }
+            String str = value.toString();
+            if (str.isEmpty() || str.indexOf('\n') >= 0) {
+                return null;
+            }
+            values.add(str);
+            if (values.size() > maxCardinality) {
+                return null;
+            }
+        }
+
+        // Check that none of the values exist in other columns.
+        // In the case of field values that span multiple lines, it's the part after the last newline that matters.
+        for (Map<String, ?> sampleRecord : sampleRecords) {
+            timeoutChecker.check("keyword-based multi-line start pattern determination");
+            if (sampleRecord.entrySet()
+                .stream()
+                .anyMatch(entry -> entry.getKey().equals(columnName) == false && containsLastLine(values, entry.getValue()))) {
+                return null;
+            }
+        }
+
+        return values.stream()
+            .map(value -> value.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1"))
+            .sorted()
+            .collect(Collectors.joining("|", "(?:", ")"));
+    }
+
+    /**
+     * @param set A set of strings.
+     * @param obj An object whose string representation may or may not contain line breaks; {@code null} always yields false.
+     * @return true if {@code set} contains the last line of {@code obj.toString()} (i.e. the whole string if it has no line breaks).
+     */
+    static boolean containsLastLine(Set<String> set, Object obj) {
+        if (obj == null) {
+            return false;
+        }
+        String str = obj.toString();
+        int lastNewline = str.lastIndexOf('\n');
+        if (lastNewline >= 0) {
+            return set.contains(str.substring(lastNewline + 1));
+        } else {
+            return set.contains(str);
+        }
+    }
+
     /**
      * Make a regular expression that Filebeat can use to ignore the header line of the delimited file.
      * (Such lines may be observed multiple times if multiple delimited files are concatenated.)

diff --git a/...c/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java b/...c/main/java/org/elasticsearch/xpack/textstructure/structurefinder/GrokPatternCreator.java
@@ -391,37 +391,80 @@ private void addIntermediateRegex(Collection<String> snippets) {
         addIntermediateRegex(overallGrokPatternBuilder, snippets);
     }
 
-    public static void addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
+    /**
+     * Create a regular expression that matches all of a supplied collection of strings. The regular expression is chosen
+     * such that it explicitly matches the punctuation and whitespace that's common to all the strings, but uses wildcards
+     * to match other characters.
+     * @param patternBuilder The intermediate regular expression will be appended to this string builder.
+     * @param snippets The portions of all the sampled messages that the generated regular expression must match.
+     * @return The length of the longest run of consecutive wildcards that each follow the same explicit character (a wildcard
+     *         appended after the final character scan is not counted). This value gives an indication about the complexity of
+     *         matching the returned pattern against a string that nearly matches but not quite.
+     */
+    public static int addIntermediateRegex(StringBuilder patternBuilder, Collection<String> snippets) {
         if (snippets.isEmpty()) {
-            return;
+            return 0;
         }
 
         List<String> others = new ArrayList<>(snippets);
         String driver = others.remove(others.size() - 1);
 
+        char lastPunctOrSpace = '\0';
+        List<Character> charsPrecedingWildcard = new ArrayList<>();
+
         boolean wildcardRequiredIfNonMatchFound = true;
         for (int i = 0; i < driver.length(); ++i) {
             char ch = driver.charAt(i);
             Boolean punctuationOrSpaceNeedsEscaping = PUNCTUATION_OR_SPACE_NEEDS_ESCAPING.get(ch);
             if (punctuationOrSpaceNeedsEscaping != null && others.stream().allMatch(other -> other.indexOf(ch) >= 0)) {
                 if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(other -> other.indexOf(ch) > 0)) {
                     patternBuilder.append(".*?");
+                    charsPrecedingWildcard.add(lastPunctOrSpace);
                 }
                 if (punctuationOrSpaceNeedsEscaping) {
                     patternBuilder.append('\\');
                 }
                 patternBuilder.append(ch);
                 wildcardRequiredIfNonMatchFound = true;
                 others = others.stream().map(other -> other.substring(other.indexOf(ch) + 1)).collect(Collectors.toList());
+                lastPunctOrSpace = ch;
             } else if (wildcardRequiredIfNonMatchFound) {
                 patternBuilder.append(".*?");
+                charsPrecedingWildcard.add(lastPunctOrSpace);
                 wildcardRequiredIfNonMatchFound = false;
             }
         }
 
         if (wildcardRequiredIfNonMatchFound && others.stream().anyMatch(s -> s.isEmpty() == false)) {
             patternBuilder.append(".*?");
         }
+
+        return longestRun(charsPrecedingWildcard);
+    }
+
+    /**
+     * @return The length of the longest run of consecutive equal elements in {@code sequence} (0 if it is empty).
+     */
+    static int longestRun(List<?> sequence) {
+        if (sequence.size() <= 1) {
+            return sequence.size();
+        }
+        int maxSoFar = 0;
+        int thisCount = 1;
+        for (int index = 1; index < sequence.size(); ++index) {
+            if (sequence.get(index).equals(sequence.get(index - 1))) {
+                ++thisCount;
+            } else {
+                maxSoFar = Math.max(maxSoFar, thisCount);
+                // Stop early: the elements that remain are too few to form a run longer than the max so far
+                if (maxSoFar >= sequence.size() - index) {
+                    return maxSoFar;
+                }
+                thisCount = 1;
+            }
+        }
+        maxSoFar = Math.max(maxSoFar, thisCount);
+        return maxSoFar;
     }
 
     private void finalizeGrokPattern(Collection<String> snippets) {

diff --git a/...in/java/org/elasticsearch/xpack/textstructure/structurefinder/LogTextStructureFinder.java b/...in/java/org/elasticsearch/xpack/textstructure/structurefinder/LogTextStructureFinder.java
@@ -23,6 +23,7 @@
 
 public class LogTextStructureFinder implements TextStructureFinder {
 
+    private static final int TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS = 8;
     private final List<String> sampleMessages;
     private final TextStructure structure;
 
@@ -250,11 +251,25 @@ static TimestampFormatFinder populateTimestampFormatFinder(
     static String createMultiLineMessageStartRegex(Collection<String> prefaces, String simpleDateRegex) {
 
         StringBuilder builder = new StringBuilder("^");
-        GrokPatternCreator.addIntermediateRegex(builder, prefaces);
+        int complexity = GrokPatternCreator.addIntermediateRegex(builder, prefaces);
         builder.append(simpleDateRegex);
         if (builder.substring(0, 3).equals("^\\b")) {
             builder.delete(1, 3);
         }
+        // This check is here primarily to protect against the horrible patterns that are generated when a not-quite-valid-CSV file
+        // has its timestamp column near the end of each line. The algorithm used to produce the multi-line start patterns can
+        // then produce patterns like this:
+        // ^.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,.*?,\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}
+        // If a pattern like this is matched against a line that nearly matches but not quite (which is basically guaranteed in
+        // the not-quite-valid-CSV file case) then the backtracking will cause the match attempt to run for many days. Therefore
+        // it's better to throw once the complexity reaches the threshold and let the user try again with overrides.
+        if (complexity >= TOO_MANY_IDENTICAL_DELIMITERS_BEFORE_WILDCARDS) {
+            throw new IllegalArgumentException(
+                "Generated multi-line start pattern based on timestamp position ["
+                    + builder
+                    + "] is too complex. If your sample is delimited then try overriding the format."
+            );
+        }
         return builder.toString();
     }
 }