diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc index 9c21d2a88b49a..e9d9da479c0f2 100644 --- a/docs/reference/ml/apis/find-file-structure.asciidoc +++ b/docs/reference/ml/apis/find-file-structure.asciidoc @@ -147,57 +147,46 @@ is not compulsory to have a timestamp in the file. -- `timestamp_format`:: - (string) The time format of the timestamp field in the file. + + (string) The Java time format of the timestamp field in the file. + + -- -NOTE: Currently there is a limitation that this format must be one that the -structure finder might choose by itself. The reason for this restriction is that -to consistently set all the fields in the response the structure finder needs a -corresponding Grok pattern name and simple regular expression for each timestamp -format. Therefore, there is little value in specifying this parameter for -structured file formats. If you know which field contains your primary timestamp, -it is as good and less error-prone to just specify `timestamp_field`. - -The valuable use case for this parameter is when the format is semi-structured +NOTE: Only a subset of Java time format letter groups are supported: + +* `a` +* `d` +* `dd` +* `EEE` +* `EEEE` +* `H` +* `HH` +* `h` +* `M` +* `MM` +* `MMM` +* `MMMM` +* `mm` +* `ss` +* `XX` +* `XXX` +* `yy` +* `yyyy` +* `zzz` + +Additionally `S` letter groups (fractional seconds) of length one to nine are +supported provided they occur after `ss` and are separated from the `ss` by a +`.`, `,` or `:`. Spacing and punctuation are also permitted, with the exception +of `?`, newline and carriage return, as is literal text enclosed in single +quotes. For example, `MM/dd HH.mm.ss,SSSSSS 'in' yyyy` is a valid override +format. 
+ +One valuable use case for this parameter is when the format is semi-structured text, there are multiple timestamp formats in the file, and you know which format corresponds to the primary timestamp, but you do not want to specify the -full `grok_pattern`. - -If this parameter is not specified, the structure finder chooses the best format from -the formats it knows, which are these Java time formats: - -* `dd/MMM/yyyy:HH:mm:ss XX` -* `EEE MMM dd HH:mm zzz yyyy` -* `EEE MMM dd HH:mm:ss yyyy` -* `EEE MMM dd HH:mm:ss zzz yyyy` -* `EEE MMM dd yyyy HH:mm zzz` -* `EEE MMM dd yyyy HH:mm:ss zzz` -* `EEE, dd MMM yyyy HH:mm XX` -* `EEE, dd MMM yyyy HH:mm XXX` -* `EEE, dd MMM yyyy HH:mm:ss XX` -* `EEE, dd MMM yyyy HH:mm:ss XXX` -* `ISO8601` -* `MMM d HH:mm:ss` -* `MMM d HH:mm:ss,SSS` -* `MMM d yyyy HH:mm:ss` -* `MMM dd HH:mm:ss` -* `MMM dd HH:mm:ss,SSS` -* `MMM dd yyyy HH:mm:ss` -* `MMM dd, yyyy h:mm:ss a` -* `TAI64N` -* `UNIX` -* `UNIX_MS` -* `yyyy-MM-dd HH:mm:ss` -* `yyyy-MM-dd HH:mm:ss,SSS` -* `yyyy-MM-dd HH:mm:ss,SSS XX` -* `yyyy-MM-dd HH:mm:ss,SSSXX` -* `yyyy-MM-dd HH:mm:ss,SSSXXX` -* `yyyy-MM-dd HH:mm:ssXX` -* `yyyy-MM-dd HH:mm:ssXXX` -* `yyyy-MM-dd'T'HH:mm:ss,SSS` -* `yyyy-MM-dd'T'HH:mm:ss,SSSXX` -* `yyyy-MM-dd'T'HH:mm:ss,SSSXXX` -* `yyyyMMddHHmmss` +full `grok_pattern`. Another is when the timestamp format is one that the +structure finder does not consider by default. + +If this parameter is not specified, the structure finder chooses the best +format from a built-in set. 
-- @@ -263,8 +252,18 @@ If the request does not encounter errors, you receive the following result: "charset" : "UTF-8", <4> "has_byte_order_marker" : false, <5> "format" : "ndjson", <6> - "need_client_timezone" : false, <7> - "mappings" : { <8> + "timestamp_field" : "release_date", <7> + "joda_timestamp_formats" : [ <8> + "ISO8601" + ], + "java_timestamp_formats" : [ <9> + "ISO8601" + ], + "need_client_timezone" : true, <10> + "mappings" : { <11> + "@timestamp" : { + "type" : "date" + }, "author" : { "type" : "keyword" }, @@ -275,10 +274,25 @@ If the request does not encounter errors, you receive the following result: "type" : "long" }, "release_date" : { - "type" : "keyword" + "type" : "date", + "format" : "iso8601" } }, - "field_stats" : { <9> + "ingest_pipeline" : { + "description" : "Ingest pipeline created by file structure finder", + "processors" : [ + { + "date" : { + "field" : "release_date", + "timezone" : "{{ beat.timezone }}", + "formats" : [ + "ISO8601" + ] + } + } + ] + }, + "field_stats" : { <12> "author" : { "count" : 24, "cardinality" : 20, @@ -484,17 +498,22 @@ If the request does not encounter errors, you receive the following result: <5> For UTF character encodings, `has_byte_order_marker` indicates whether the file begins with a byte order marker. <6> `format` is one of `ndjson`, `xml`, `delimited` or `semi_structured_text`. -<7> If a timestamp format is detected that does not include a timezone, - `need_client_timezone` will be `true`. The server that parses the file must - therefore be told the correct timezone by the client. -<8> `mappings` contains some suitable mappings for an index into which the data - could be ingested. In this case, the `release_date` field has been given a - `keyword` type as it is not considered specific enough to convert to the - `date` type. -<9> `field_stats` contains the most common values of each field, plus basic - numeric statistics for the numeric `page_count` field. 
This information - may provide clues that the data needs to be cleaned or transformed prior - to use by other {ml} functionality. +<7> The `timestamp_field` names the field considered most likely to be the + primary timestamp of each document. +<8> `joda_timestamp_formats` are used to tell Logstash how to parse timestamps. +<9> `java_timestamp_formats` are the Java time formats recognized in the time + fields. Elasticsearch mappings and ingest pipelines use these formats. +<10> If a timestamp format is detected that does not include a timezone, + `need_client_timezone` will be `true`. The server that parses the file must + therefore be told the correct timezone by the client. +<11> `mappings` contains some suitable mappings for an index into which the data + could be ingested. In this case, the `release_date` field has been given the + `date` type with the `iso8601` format, and an `@timestamp` field of type + `date` has been added. +<12> `field_stats` contains the most common values of each field, plus basic + numeric statistics for the numeric `page_count` field. This information + may provide clues that the data needs to be cleaned or transformed prior + to use by other {ml} functionality. The next example shows how it's possible to find the structure of some New York City yellow cab trip data. 
The first `curl` command downloads the data, the @@ -526,7 +545,7 @@ If the request does not encounter errors, you receive the following result: "charset" : "UTF-8", "has_byte_order_marker" : false, "format" : "delimited", <2> - "multiline_start_pattern" : "^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", + "multiline_start_pattern" : "^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "exclude_lines_pattern" : "^\"?VendorID\"?,\"?tpep_pickup_datetime\"?,\"?tpep_dropoff_datetime\"?,\"?passenger_count\"?,\"?trip_distance\"?,\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?,\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", "column_names" : [ <3> "VendorID", @@ -1361,14 +1380,14 @@ this: "charset" : "UTF-8", "has_byte_order_marker" : false, "format" : "semi_structured_text", <1> - "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", <2> + "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", <2> "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel}.*", <3> "timestamp_field" : "timestamp", "joda_timestamp_formats" : [ "ISO8601" ], "java_timestamp_formats" : [ - "yyyy-MM-dd'T'HH:mm:ss,SSS" + "ISO8601" ], "need_client_timezone" : true, "mappings" : { @@ -1398,7 +1417,7 @@ this: "field" : "timestamp", "timezone" : "{{ beat.timezone }}", "formats" : [ - "yyyy-MM-dd'T'HH:mm:ss,SSS" + "ISO8601" ] } }, @@ -1515,14 +1534,14 @@ this: "charset" : "UTF-8", "has_byte_order_marker" : false, "format" : "semi_structured_text", - "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", + "multiline_start_pattern" : "^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "grok_pattern" : "\\[%{TIMESTAMP_ISO8601:timestamp}\\]\\[%{LOGLEVEL:loglevel} *\\]\\[%{JAVACLASS:class} *\\] \\[%{HOSTNAME:node}\\] %{JAVALOGMESSAGE:message}", <1> "timestamp_field" : 
"timestamp", "joda_timestamp_formats" : [ "ISO8601" ], "java_timestamp_formats" : [ - "yyyy-MM-dd'T'HH:mm:ss,SSS" + "ISO8601" ], "need_client_timezone" : true, "mappings" : { @@ -1558,7 +1577,7 @@ this: "field" : "timestamp", "timezone" : "{{ beat.timezone }}", "formats" : [ - "yyyy-MM-dd'T'HH:mm:ss,SSS" + "ISO8601" ] } }, diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java index dd30c0a1f94bc..aa88905962638 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java @@ -8,7 +8,6 @@ import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import org.supercsv.exception.SuperCsvException; import org.supercsv.io.CsvListReader; import org.supercsv.prefs.CsvPreference; @@ -27,7 +26,6 @@ import java.util.Map; import java.util.Random; import java.util.SortedMap; -import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -62,7 +60,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List (field == null) ? 
null : field.trim()).collect(Collectors.toList()) : row); sampleRecords.add(sampleRecord); sampleMessages.add( - sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)).stream().collect(Collectors.joining("\n"))); + String.join("\n", sampleLines.subList(prevMessageEndLineNumber + 1, lineNumbers.get(index)))); prevMessageEndLineNumber = lineNumber; } - String preamble = Pattern.compile("\n").splitAsStream(sample).limit(lineNumbers.get(1)).collect(Collectors.joining("\n", "", "\n")); + String preamble = String.join("\n", sampleLines.subList(0, lineNumbers.get(1))) + "\n"; + + // null to allow GC before timestamp search + sampleLines = null; char delimiter = (char) csvPreference.getDelimiterChar(); FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.DELIMITED) @@ -107,7 +108,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); if (timeField != null) { String timeLineRegex = null; @@ -119,7 +120,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(Listnull if + * @return A tuple of (field name, timestamp format finder) if one can be found, or null if * there is no consistent timestamp. 
*/ - static Tuple guessTimestampField(List explanation, List> sampleRecords, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + static Tuple guessTimestampField(List explanation, List> sampleRecords, + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { if (sampleRecords.isEmpty()) { return null; } + StringBuilder exceptionMsg = null; + // Accept the first match from the first sample that is compatible with all the other samples - for (Tuple candidate : findCandidates(explanation, sampleRecords, overrides, timeoutChecker)) { + for (Tuple candidate : findCandidates(explanation, sampleRecords, overrides, timeoutChecker)) { + + String fieldName = candidate.v1(); + TimestampFormatFinder timestampFormatFinder = candidate.v2(); boolean allGood = true; for (Map sampleRecord : sampleRecords.subList(1, sampleRecords.size())) { - Object fieldValue = sampleRecord.get(candidate.v1()); + Object fieldValue = sampleRecord.get(fieldName); if (fieldValue == null) { if (overrides.getTimestampField() != null) { throw new IllegalArgumentException("Specified timestamp field [" + overrides.getTimestampField() + "] is not present in record [" + sampleRecord + "]"); } - explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord + + explanation.add("First sample match [" + fieldName + "] ruled out because record [" + sampleRecord + "] doesn't have field"); allGood = false; break; @@ -88,15 +91,20 @@ static Tuple guessTimestampField(List explanatio timeoutChecker.check("timestamp field determination"); - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(fieldValue.toString(), overrides.getTimestampFormat(), - timeoutChecker); - if (match == null || match.candidateIndex != candidate.v2().candidateIndex) { + try { + timestampFormatFinder.addSample(fieldValue.toString()); + } catch (IllegalArgumentException e) { if (overrides.getTimestampFormat() != null) { - throw new 
IllegalArgumentException("Specified timestamp format [" + overrides.getTimestampFormat() + - "] does not match for record [" + sampleRecord + "]"); + if (exceptionMsg == null) { + exceptionMsg = new StringBuilder("Specified timestamp format [" + overrides.getTimestampFormat() + + "] does not match"); + } else { + exceptionMsg.append(", nor"); + } + exceptionMsg.append(" for record [").append(sampleRecord).append("] in field [").append(fieldName).append("]"); } - explanation.add("First sample match [" + candidate.v1() + "] ruled out because record [" + sampleRecord + - "] matches differently: [" + match + "]"); + explanation.add("First sample match " + timestampFormatFinder.getRawJavaTimestampFormats() + + " ruled out because record [" + sampleRecord + "] does not match"); allGood = false; break; } @@ -104,16 +112,21 @@ static Tuple guessTimestampField(List explanatio if (allGood) { explanation.add(((overrides.getTimestampField() == null) ? "Guessing timestamp" : "Timestamp") + - " field is [" + candidate.v1() + "] with format [" + candidate.v2() + "]"); + " field is [" + fieldName + "] with format " + timestampFormatFinder.getJavaTimestampFormats()); return candidate; } } + if (exceptionMsg != null) { + throw new IllegalArgumentException(exceptionMsg.toString()); + } + return null; } - private static List> findCandidates(List explanation, List> sampleRecords, - FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + private static List> findCandidates(List explanation, List> sampleRecords, + FileStructureOverrides overrides, + TimeoutChecker timeoutChecker) { assert sampleRecords.isEmpty() == false; Map firstRecord = sampleRecords.get(0); @@ -124,7 +137,7 @@ private static List> findCandidates(List e "] is not present in record [" + firstRecord + "]"); } - List> candidates = new ArrayList<>(); + List> candidates = new ArrayList<>(); // Get candidate timestamps from the possible field(s) of the first sample record for (Map.Entry field : 
firstRecord.entrySet()) { @@ -132,12 +145,17 @@ private static List> findCandidates(List e if (onlyConsiderField == null || onlyConsiderField.equals(fieldName)) { Object value = field.getValue(); if (value != null) { - TimestampMatch match = TimestampFormatFinder.findFirstFullMatch(value.toString(), overrides.getTimestampFormat(), - timeoutChecker); - if (match != null) { - Tuple candidate = new Tuple<>(fieldName, match); - candidates.add(candidate); - explanation.add("First sample timestamp match [" + candidate + "]"); + // Construct the TimestampFormatFinder outside the no-op catch because an exception + // from the constructor indicates a problem with the overridden format + TimestampFormatFinder timestampFormatFinder = + new TimestampFormatFinder(explanation, overrides.getTimestampFormat(), true, true, true, timeoutChecker); + try { + timestampFormatFinder.addSample(value.toString()); + candidates.add(new Tuple<>(fieldName, timestampFormatFinder)); + explanation.add("First sample timestamp match " + timestampFormatFinder.getRawJavaTimestampFormats() + + " for field [" + fieldName + "]"); + } catch (IllegalArgumentException e) { + // No possible timestamp format found in this particular field - not a problem } } } @@ -231,6 +249,27 @@ private static Stream flatten(Object value) { } } + /** + * Finds the appropriate date mapping for a collection of field values. Throws + * {@link IllegalArgumentException} if no consistent date mapping can be found. + * @param explanation List of reasons for choosing the overall file structure. This list + * may be non-empty when the method is called, and this method may + * append to it. + * @param fieldValues Values of the field for which mappings are to be guessed. The guessed + * mapping will be compatible with all the provided values. Must not be + * empty. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. + * @return The sub-section of the index mappings most appropriate for the field. 
+ */ + static Map findTimestampMapping(List explanation, Collection fieldValues, + TimeoutChecker timeoutChecker) { + assert fieldValues.isEmpty() == false; + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, timeoutChecker); + fieldValues.forEach(timestampFormatFinder::addSample); + return timestampFormatFinder.getEsDateMappingTypeWithFormat(); + } + /** * Given some sample values for a field, guess the most appropriate index mapping for the * field. @@ -247,26 +286,17 @@ private static Stream flatten(Object value) { */ static Map guessScalarMapping(List explanation, String fieldName, Collection fieldValues, TimeoutChecker timeoutChecker) { - assert fieldValues.isEmpty() == false; if (fieldValues.stream().allMatch(value -> "true".equals(value) || "false".equals(value))) { return Collections.singletonMap(MAPPING_TYPE_SETTING, "boolean"); } - // This checks if a date mapping would be appropriate, and, if so, finds the correct format - Iterator iter = fieldValues.iterator(); - TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next(), timeoutChecker); - while (timestampMatch != null && iter.hasNext()) { - // To be mapped as type date all the values must match the same timestamp format - it is - // not acceptable for all values to be dates, but with different formats - if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex, - timeoutChecker)) == false) { - timestampMatch = null; - } - } - if (timestampMatch != null) { - return timestampMatch.getEsDateMappingTypeWithFormat(); + try { + return findTimestampMapping(explanation, fieldValues, timeoutChecker); + } catch (IllegalArgumentException e) { + // To be mapped as type "date" all the values must match the same timestamp format - if + // they don't we'll end up here, and move on to try other possible mappings } if (fieldValues.stream().allMatch(NUMBER_GROK::match)) { @@ -321,6 +351,7 
@@ static boolean isMoreLikelyTextThanKeyword(String str) { * Create an ingest pipeline definition appropriate for the file structure. * @param grokPattern The Grok pattern used for parsing semi-structured text formats. null for * fully structured formats. + * @param customGrokPatternDefinitions The definitions for any custom patterns that {@code grokPattern} uses. * @param timestampField The input field containing the timestamp to be parsed into @timestamp. * null if there is no timestamp. * @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}. @@ -328,7 +359,8 @@ static boolean isMoreLikelyTextThanKeyword(String str) { * @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp? * @return The ingest pipeline definition, or null if none is required. */ - public static Map makeIngestPipelineDefinition(String grokPattern, String timestampField, List timestampFormats, + public static Map makeIngestPipelineDefinition(String grokPattern, Map customGrokPatternDefinitions, + String timestampField, List timestampFormats, boolean needClientTimezone) { if (grokPattern == null && timestampField == null) { @@ -344,7 +376,12 @@ public static Map makeIngestPipelineDefinition(String grokPatter Map grokProcessorSettings = new LinkedHashMap<>(); grokProcessorSettings.put("field", "message"); grokProcessorSettings.put("patterns", Collections.singletonList(grokPattern)); + if (customGrokPatternDefinitions.isEmpty() == false) { + grokProcessorSettings.put("pattern_definitions", customGrokPatternDefinitions); + } processors.add(Collections.singletonMap("grok", grokProcessorSettings)); + } else { + assert customGrokPatternDefinitions.isEmpty(); } if (timestampField != null) { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java 
b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java index 6620afcb7145b..7a5c9a48f8757 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreator.java @@ -8,7 +8,6 @@ import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.grok.Grok; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.util.ArrayList; import java.util.Arrays; @@ -18,6 +17,7 @@ import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -76,10 +76,12 @@ public final class GrokPatternCreator { new ValueOnlyGrokPatternCandidate("DATESTAMP_RFC2822", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("DATESTAMP_OTHER", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("DATESTAMP_EVENTLOG", "date", "extra_timestamp"), + new ValueOnlyGrokPatternCandidate("HTTPDERROR_DATE", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("SYSLOGTIMESTAMP", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("HTTPDATE", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("CATALINA_DATESTAMP", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("CISCOTIMESTAMP", "date", "extra_timestamp"), + new ValueOnlyGrokPatternCandidate("DATESTAMP", "date", "extra_timestamp"), new ValueOnlyGrokPatternCandidate("LOGLEVEL", "keyword", "loglevel"), new ValueOnlyGrokPatternCandidate("URI", "keyword", "uri"), new ValueOnlyGrokPatternCandidate("UUID", "keyword", "uuid"), @@ -90,7 +92,8 @@ public final class GrokPatternCreator { // TODO: would be nice to have IPORHOST here, but HOSTNAME matches almost 
all words new ValueOnlyGrokPatternCandidate("IP", "ip", "ipaddress"), new ValueOnlyGrokPatternCandidate("DATE", "date", "date"), - new ValueOnlyGrokPatternCandidate("TIME", "date", "time"), + // A time with no date cannot be stored in a field of type "date", hence "keyword" + new ValueOnlyGrokPatternCandidate("TIME", "keyword", "time"), // This already includes pre/post break conditions new ValueOnlyGrokPatternCandidate("QUOTEDSTRING", "keyword", "field", "", ""), // Disallow +, - and . before numbers, as well as "word" characters, otherwise we'll pick @@ -121,6 +124,7 @@ public final class GrokPatternCreator { */ private final Map mappings; private final Map fieldStats; + private final Map grokPatternDefinitions; private final Map fieldNameCountStore = new HashMap<>(); private final StringBuilder overallGrokPatternBuilder = new StringBuilder(); private final TimeoutChecker timeoutChecker; @@ -131,16 +135,24 @@ public final class GrokPatternCreator { * can be appended by the methods of this class. * @param sampleMessages Sample messages that any Grok pattern found must match. * @param mappings Will be updated with mappings appropriate for the returned pattern, if non-null. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @param fieldStats Will be updated with field stats for the fields in the returned pattern, if non-null. + * @param customGrokPatternDefinitions Custom Grok pattern definitions to add to the built-in ones. + * @param timeoutChecker Will abort the operation if its timeout is exceeded. 
*/ public GrokPatternCreator(List explanation, Collection sampleMessages, Map mappings, - Map fieldStats, TimeoutChecker timeoutChecker) { - this.explanation = explanation; + Map fieldStats, Map customGrokPatternDefinitions, + TimeoutChecker timeoutChecker) { + this.explanation = Objects.requireNonNull(explanation); this.sampleMessages = Collections.unmodifiableCollection(sampleMessages); this.mappings = mappings; this.fieldStats = fieldStats; - this.timeoutChecker = timeoutChecker; + if (customGrokPatternDefinitions.isEmpty()) { + grokPatternDefinitions = Grok.getBuiltinPatterns(); + } else { + grokPatternDefinitions = new HashMap<>(Grok.getBuiltinPatterns()); + grokPatternDefinitions.putAll(customGrokPatternDefinitions); + } + this.timeoutChecker = Objects.requireNonNull(timeoutChecker); } /** @@ -171,7 +183,8 @@ public Tuple findFullLineGrokPattern(String timestampField) { */ public void validateFullLineGrokPattern(String grokPattern, String timestampField) { - FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField); + FullMatchGrokPatternCandidate candidate = FullMatchGrokPatternCandidate.fromGrokPattern(grokPattern, timestampField, + grokPatternDefinitions); if (candidate.matchesAll(sampleMessages, timeoutChecker)) { candidate.processMatch(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); } else { @@ -189,7 +202,7 @@ public String createGrokPatternFromExamples(String seedPatternName, String seedF overallGrokPatternBuilder.setLength(0); - GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName); + GrokPatternCandidate seedCandidate = new NoMappingGrokPatternCandidate(seedPatternName, seedFieldName, grokPatternDefinitions); processCandidateAndSplit(seedCandidate, true, sampleMessages, false, 0, false, 0); @@ -215,8 +228,8 @@ private void processCandidateAndSplit(GrokPatternCandidate chosenPattern, boolea Collection prefaces = new 
ArrayList<>(); Collection epilogues = new ArrayList<>(); - String patternBuilderContent = - chosenPattern.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, mappings, fieldStats, timeoutChecker); + String patternBuilderContent = chosenPattern.processCaptures(explanation, fieldNameCountStore, snippets, prefaces, epilogues, + mappings, fieldStats, timeoutChecker); appendBestGrokMatchForStrings(false, prefaces, ignoreKeyValueCandidateLeft, ignoreValueOnlyCandidatesLeft); overallGrokPatternBuilder.append(patternBuilderContent); appendBestGrokMatchForStrings(isLast, epilogues, ignoreKeyValueCandidateRight, ignoreValueOnlyCandidatesRight); @@ -234,7 +247,7 @@ void appendBestGrokMatchForStrings(boolean isLast, Collection snippets, GrokPatternCandidate bestCandidate = null; if (snippets.isEmpty() == false) { - GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(explanation); + GrokPatternCandidate kvCandidate = new KeyValueGrokPatternCandidate(); if (ignoreKeyValueCandidate == false && kvCandidate.matchesAll(snippets)) { bestCandidate = kvCandidate; } else { @@ -409,9 +422,9 @@ interface GrokPatternCandidate { * calculate field stats. * @return The string that needs to be incorporated into the overall Grok pattern for the line. */ - String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats, - TimeoutChecker timeoutChecker); + String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, + Collection prefaces, Collection epilogues, Map mappings, + Map fieldStats, TimeoutChecker timeoutChecker); } /** @@ -434,10 +447,22 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * for the pre and/or post breaks. * * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. + * @param mappingType Data type for field in Elasticsearch mappings. 
* @param fieldName Name of the field to extract from the match. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName) { - this(grokPatternName, mappingType, fieldName, "\\b", "\\b"); + this(grokPatternName, mappingType, fieldName, "\\b", "\\b", Grok.getBuiltinPatterns()); + } + + /** + * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. + * @param mappingType Data type for field in Elasticsearch mappings. + * @param fieldName Name of the field to extract from the match. + * @param grokPatternDefinitions Definitions of Grok patterns to be used. + */ + ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, + Map grokPatternDefinitions) { + this(grokPatternName, mappingType, fieldName, "\\b", "\\b", grokPatternDefinitions); } /** @@ -448,11 +473,24 @@ static class ValueOnlyGrokPatternCandidate implements GrokPatternCandidate { * @param postBreak Only consider the match if it's broken from the following text by this. */ ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak) { + this(grokPatternName, mappingType, fieldName, preBreak, postBreak, Grok.getBuiltinPatterns()); + } + + /** + * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. + * @param mappingType Data type for field in Elasticsearch mappings. + * @param fieldName Name of the field to extract from the match. + * @param preBreak Only consider the match if it's broken from the previous text by this. + * @param postBreak Only consider the match if it's broken from the following text by this. + * @param grokPatternDefinitions Definitions of Grok patterns to be used. 
+ */ + ValueOnlyGrokPatternCandidate(String grokPatternName, String mappingType, String fieldName, String preBreak, String postBreak, + Map grokPatternDefinitions) { this.grokPatternName = grokPatternName; this.mappingType = mappingType; this.fieldName = fieldName; // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java - grok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + preBreak + + grok = new Grok(grokPatternDefinitions, "(?m)%{DATA:" + PREFACE + "}" + preBreak + "%{" + grokPatternName + ":" + VALUE + "}" + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog); } @@ -467,9 +505,9 @@ public boolean matchesAll(Collection snippets) { * bit that matches. */ @Override - public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats, - TimeoutChecker timeoutChecker) { + public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, + Collection prefaces, Collection epilogues, Map mappings, + Map fieldStats, TimeoutChecker timeoutChecker) { Collection values = new ArrayList<>(); for (String snippet : snippets) { Map captures = timeoutChecker.grokCaptures(grok, snippet, "full message Grok pattern field extraction"); @@ -485,10 +523,13 @@ public String processCaptures(Map fieldNameCountStore, Collecti if (mappings != null) { Map fullMappingType = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType); if ("date".equals(mappingType)) { - assert values.isEmpty() == false; - TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(values.iterator().next(), timeoutChecker); - if (timestampMatch != null) { - fullMappingType = timestampMatch.getEsDateMappingTypeWithFormat(); + try { + fullMappingType = FileStructureUtils.findTimestampMapping(explanation, values, timeoutChecker); + } catch (IllegalArgumentException e) { + // This feels like it shouldn't happen, 
but there may be some obscure edge case + // where it does, and in production it will cause less frustration to just return + // a mapping type of "date" with no format than to fail the whole analysis + assert e == null : e.getMessage(); } timeoutChecker.check("mapping determination"); } @@ -509,13 +550,9 @@ public String processCaptures(Map fieldNameCountStore, Collecti */ static class KeyValueGrokPatternCandidate implements GrokPatternCandidate { - private static final Pattern kvFinder = Pattern.compile("\\b(\\w+)=[\\w.-]+"); - private final List explanation; - private String fieldName; + private static final Pattern KV_FINDER = Pattern.compile("\\b(\\w+)=[\\w.-]+"); - KeyValueGrokPatternCandidate(List explanation) { - this.explanation = explanation; - } + private String fieldName; @Override public boolean matchesAll(Collection snippets) { @@ -523,7 +560,7 @@ public boolean matchesAll(Collection snippets) { boolean isFirst = true; for (String snippet : snippets) { if (isFirst) { - Matcher matcher = kvFinder.matcher(snippet); + Matcher matcher = KV_FINDER.matcher(snippet); while (matcher.find()) { candidateNames.add(matcher.group(1)); } @@ -540,9 +577,9 @@ public boolean matchesAll(Collection snippets) { } @Override - public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats, - TimeoutChecker timeoutChecker) { + public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, + Collection prefaces, Collection epilogues, Map mappings, + Map fieldStats, TimeoutChecker timeoutChecker) { if (fieldName == null) { throw new IllegalStateException("Cannot process KV matches until a field name has been determined"); } @@ -578,15 +615,15 @@ public String processCaptures(Map fieldNameCountStore, Collecti */ static class NoMappingGrokPatternCandidate extends ValueOnlyGrokPatternCandidate { - NoMappingGrokPatternCandidate(String grokPatternName, String 
fieldName) { - super(grokPatternName, null, fieldName); + NoMappingGrokPatternCandidate(String grokPatternName, String fieldName, Map grokPatternDefinitions) { + super(grokPatternName, null, fieldName, grokPatternDefinitions); } @Override - public String processCaptures(Map fieldNameCountStore, Collection snippets, Collection prefaces, - Collection epilogues, Map mappings, Map fieldStats, - TimeoutChecker timeoutChecker) { - return super.processCaptures(fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats, timeoutChecker); + public String processCaptures(List explanation, Map fieldNameCountStore, Collection snippets, + Collection prefaces, Collection epilogues, Map mappings, + Map fieldStats, TimeoutChecker timeoutChecker) { + return super.processCaptures(explanation, fieldNameCountStore, snippets, prefaces, epilogues, null, fieldStats, timeoutChecker); } } @@ -600,17 +637,27 @@ static class FullMatchGrokPatternCandidate { private final Grok grok; static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField) { - return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField); + return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField, Grok.getBuiltinPatterns()); + } + + static FullMatchGrokPatternCandidate fromGrokPatternName(String grokPatternName, String timeField, + Map grokPatternDefinitions) { + return new FullMatchGrokPatternCandidate("%{" + grokPatternName + "}", timeField, grokPatternDefinitions); } static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField) { - return new FullMatchGrokPatternCandidate(grokPattern, timeField); + return new FullMatchGrokPatternCandidate(grokPattern, timeField, Grok.getBuiltinPatterns()); + } + + static FullMatchGrokPatternCandidate fromGrokPattern(String grokPattern, String timeField, + Map grokPatternDefinitions) { + return new FullMatchGrokPatternCandidate(grokPattern, timeField, 
grokPatternDefinitions); } - private FullMatchGrokPatternCandidate(String grokPattern, String timeField) { + private FullMatchGrokPatternCandidate(String grokPattern, String timeField, Map grokPatternDefinitions) { this.grokPattern = grokPattern; this.timeField = timeField; - grok = new Grok(Grok.getBuiltinPatterns(), grokPattern, TimeoutChecker.watchdog); + grok = new Grok(grokPatternDefinitions, grokPattern, TimeoutChecker.watchdog); } public String getTimeField() { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java index 33d9ba56b3f53..116de8f7679d2 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java @@ -11,7 +11,6 @@ import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.io.IOException; import java.util.ArrayList; @@ -53,17 +52,17 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List expl .setNumLinesAnalyzed(sampleMessages.size()) .setNumMessagesAnalyzed(sampleRecords.size()); - Tuple timeField = + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); if (timeField != null) { boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); structureBuilder.setTimestampField(timeField.v1()) - .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) - .setJavaTimestampFormats(timeField.v2().javaTimestampFormats) + 
.setJodaTimestampFormats(timeField.v2().getJodaTimestampFormats()) + .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, timeField.v1(), - timeField.v2().javaTimestampFormats, needClientTimeZone)); + .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), timeField.v1(), + timeField.v2().getJavaTimestampFormats(), needClientTimeZone)); } Tuple, SortedMap> mappingsAndFieldStats = diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java index 36e5e91b4326b..d07eea15f973f 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java @@ -8,16 +8,12 @@ import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.HashSet; -import java.util.LinkedHashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import java.util.regex.Pattern; @@ -30,25 +26,33 @@ public class TextLogFileStructureFinder implements FileStructureFinder { static TextLogFileStructureFinder makeTextLogFileStructureFinder(List explanation, String sample, String charsetName, Boolean hasByteOrderMarker, FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { - String[] sampleLines = 
sample.split("\n"); - Tuple> bestTimestamp = mostLikelyTimestamp(sampleLines, overrides, timeoutChecker); - if (bestTimestamp == null) { - // Is it appropriate to treat a file that is neither structured nor has - // a regular pattern of timestamps as a log file? Probably not... - throw new IllegalArgumentException("Could not find " + - ((overrides.getTimestampFormat() == null) ? "a timestamp" : "the specified timestamp format") + " in the sample provided"); + TimestampFormatFinder timestampFormatFinder = populateTimestampFormatFinder(explanation, sampleLines, overrides, timeoutChecker); + switch (timestampFormatFinder.getNumMatchedFormats()) { + case 0: + // Is it appropriate to treat a file that is neither structured nor has + // a regular pattern of timestamps as a log file? Probably not... + throw new IllegalArgumentException("Could not find " + ((overrides.getTimestampFormat() == null) + ? "a timestamp" + : "the specified timestamp format") + " in the sample provided"); + case 1: + // Simple case + break; + default: + timestampFormatFinder.selectBestMatch(); + break; } - explanation.add(((overrides.getTimestampFormat() == null) ? "Most likely timestamp" : "Timestamp") + " format is [" + - bestTimestamp.v1() + "]"); + explanation.add(((overrides.getTimestampFormat() == null) ? 
"Most likely timestamp" : "Timestamp") + " format is " + + timestampFormatFinder.getJavaTimestampFormats()); List sampleMessages = new ArrayList<>(); StringBuilder preamble = new StringBuilder(); int linesConsumed = 0; StringBuilder message = null; int linesInMessage = 0; - String multiLineRegex = createMultiLineMessageStartRegex(bestTimestamp.v2(), bestTimestamp.v1().simplePattern.pattern()); + String multiLineRegex = createMultiLineMessageStartRegex(timestampFormatFinder.getPrefaces(), + timestampFormatFinder.getSimplePattern().pattern()); Pattern multiLinePattern = Pattern.compile(multiLineRegex); for (String sampleLine : sampleLines) { if (multiLinePattern.matcher(sampleLine).find()) { @@ -82,6 +86,9 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex + "problem is probably that the primary timestamp format has been incorrectly detected, so try overriding it."); } + // null to allow GC before Grok pattern search + sampleLines = null; + FileStructure.Builder structureBuilder = new FileStructure.Builder(FileStructure.Format.SEMI_STRUCTURED_TEXT) .setCharset(charsetName) .setHasByteOrderMarker(hasByteOrderMarker) @@ -97,7 +104,9 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex SortedMap fieldStats = new TreeMap<>(); fieldStats.put("message", FileStructureUtils.calculateFieldStats(sampleMessages, timeoutChecker)); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats, timeoutChecker); + Map customGrokPatternDefinitions = timestampFormatFinder.getCustomGrokPatternDefinitions(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, fieldStats, + customGrokPatternDefinitions, timeoutChecker); // We can't parse directly into @timestamp using Grok, so parse to some other time field, which the date filter will then remove String interimTimestampField = overrides.getTimestampField(); String grokPattern = 
overrides.getGrokPattern(); @@ -116,20 +125,22 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List ex if (interimTimestampField == null) { interimTimestampField = "timestamp"; } - grokPattern = grokPatternCreator.createGrokPatternFromExamples(bestTimestamp.v1().grokPatternName, interimTimestampField); + grokPattern = + grokPatternCreator.createGrokPatternFromExamples(timestampFormatFinder.getGrokPatternName(), interimTimestampField); } } - boolean needClientTimeZone = bestTimestamp.v1().hasTimezoneDependentParsing(); + boolean needClientTimeZone = timestampFormatFinder.hasTimezoneDependentParsing(); FileStructure structure = structureBuilder .setTimestampField(interimTimestampField) - .setJodaTimestampFormats(bestTimestamp.v1().jodaTimestampFormats) - .setJavaTimestampFormats(bestTimestamp.v1().javaTimestampFormats) + .setJodaTimestampFormats(timestampFormatFinder.getJodaTimestampFormats()) + .setJavaTimestampFormats(timestampFormatFinder.getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) .setGrokPattern(grokPattern) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, interimTimestampField, - bestTimestamp.v1().javaTimestampFormats, needClientTimeZone)) + .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, + customGrokPatternDefinitions, interimTimestampField, + timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone)) .setMappings(mappings) .setFieldStats(fieldStats) .setExplanation(explanation) @@ -153,79 +164,23 @@ public FileStructure getStructure() { return structure; } - static Tuple> mostLikelyTimestamp(String[] sampleLines, FileStructureOverrides overrides, - TimeoutChecker timeoutChecker) { + static TimestampFormatFinder populateTimestampFormatFinder(List explanation, String[] sampleLines, + FileStructureOverrides overrides, TimeoutChecker timeoutChecker) { + TimestampFormatFinder timestampFormatFinder = + new TimestampFormatFinder(explanation, 
overrides.getTimestampFormat(), false, false, false, timeoutChecker); - Map>> timestampMatches = new LinkedHashMap<>(); - - int remainingLines = sampleLines.length; - double differenceBetweenTwoHighestWeights = 0.0; for (String sampleLine : sampleLines) { - TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat(), timeoutChecker); - if (match != null) { - TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.jodaTimestampFormats, - match.javaTimestampFormats, match.simplePattern, match.grokPatternName, ""); - timestampMatches.compute(pureMatch, (k, v) -> { - if (v == null) { - return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface))); - } else { - v.v2().add(match.preface); - return new Tuple<>(v.v1() + weightForMatch(match.preface), v.v2()); - } - }); - differenceBetweenTwoHighestWeights = findDifferenceBetweenTwoHighestWeights(timestampMatches.values()); - } - timeoutChecker.check("timestamp format determination"); - // The highest possible weight is 1, so if the difference between the two highest weights - // is less than the number of lines remaining then the leader cannot possibly be overtaken - if (differenceBetweenTwoHighestWeights > --remainingLines) { - break; - } + timestampFormatFinder.addSample(sampleLine); } - double highestWeight = 0.0; - Tuple> highestWeightMatch = null; - for (Map.Entry>> entry : timestampMatches.entrySet()) { - double weight = entry.getValue().v1(); - if (weight > highestWeight) { - highestWeight = weight; - highestWeightMatch = new Tuple<>(entry.getKey(), entry.getValue().v2()); - } - } - return highestWeightMatch; - } - - /** - * Used to weight a timestamp match according to how far along the line it is found. - * Timestamps at the very beginning of the line are given a weight of 1. The weight - * progressively decreases the more text there is preceding the timestamp match, but - * is always greater than 0. 
- * @return A weight in the range (0, 1]. - */ - private static double weightForMatch(String preface) { - return Math.pow(1.0 + preface.length() / 15.0, -1.1); - } - - private static double findDifferenceBetweenTwoHighestWeights(Collection>> timestampMatches) { - double highestWeight = 0.0; - double secondHighestWeight = 0.0; - for (Tuple> timestampMatch : timestampMatches) { - double weight = timestampMatch.v1(); - if (weight > highestWeight) { - secondHighestWeight = highestWeight; - highestWeight = weight; - } else if (weight > secondHighestWeight) { - secondHighestWeight = weight; - } - } - return highestWeight - secondHighestWeight; + return timestampFormatFinder; } - static String createMultiLineMessageStartRegex(Collection prefaces, String timestampRegex) { + static String createMultiLineMessageStartRegex(Collection prefaces, String simpleDateRegex) { StringBuilder builder = new StringBuilder("^"); GrokPatternCreator.addIntermediateRegex(builder, prefaces); - builder.append(timestampRegex); + builder.append(simpleDateRegex); if (builder.substring(0, 3).equals("^\\b")) { builder.delete(1, 3); } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java index c19a93a7be99e..0283437d64808 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java @@ -5,56 +5,106 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; +import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.SuppressForbidden; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.grok.Grok; +import java.time.DateTimeException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneOffset; +import 
java.time.format.DateTimeFormatter; +import java.time.format.FormatStyle; +import java.time.format.ResolverStyle; +import java.time.temporal.ChronoField; +import java.time.temporal.TemporalAccessor; +import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; +import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; +import java.util.Set; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.Stream; /** * Used to find the best timestamp format for one of the following situations: * 1. Matching an entire field value * 2. Matching a timestamp found somewhere within a message + * + * This class is not thread safe. Each object of this class should only be used from within a single thread. */ public final class TimestampFormatFinder { private static final String PREFACE = "preface"; private static final String EPILOGUE = "epilogue"; + private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?"; private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,"; - private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})"); - private static final char DEFAULT_FRACTIONAL_SECOND_SEPARATOR = ','; - private static final Pattern FRACTIONAL_SECOND_TIMESTAMP_FORMAT_PATTERN = - Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "]S{3,9})"); - private static final String DEFAULT_FRACTIONAL_SECOND_FORMAT = DEFAULT_FRACTIONAL_SECOND_SEPARATOR + "SSS"; - - /** - * The timestamp patterns are complex and it can be slow to prove they do not - * match anywhere in a long message. 
Many of the timestamps are similar and - * will never be found in a string if simpler sub-patterns do not exist in the - * string. These sub-patterns can be used to quickly rule out multiple complex - * patterns. These patterns do not need to represent quantities that are - * useful to know the value of, merely character sequences that can be used to - * prove that several more complex patterns cannot possibly match. - */ - private static final List QUICK_RULE_OUT_PATTERNS = Arrays.asList( - // YYYY-MM-dd followed by a space - Pattern.compile("\\b\\d{4}-\\d{2}-\\d{2} "), - // The end of some number (likely year or day) followed by a space then HH:mm - Pattern.compile("\\d \\d{2}:\\d{2}\\b"), - // HH:mm:ss surrounded by spaces - Pattern.compile(" \\d{2}:\\d{2}:\\d{2} "), - // Literal 'T' surrounded by numbers - Pattern.compile("\\dT\\d") - ); + private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?'; + // The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER + // above, but they're literals in this regex to aid readability + private static final Pattern INDETERMINATE_FORMAT_INTERPRETER = Pattern.compile("([^?]*)(\\?{1,2})(?:([^?]*)(\\?{1,2})([^?]*))?"); + + /** + * These are the date format letter groups that are supported in custom formats + * + * (Note: Fractional seconds is a special case as they have to follow seconds.) 
+ */ + private static final Map> VALID_LETTER_GROUPS; + static { + Map> validLetterGroups = new HashMap<>(); + validLetterGroups.put("yyyy", new Tuple<>("%{YEAR}", "\\d{4}")); + validLetterGroups.put("yy", new Tuple<>("%{YEAR}", "\\d{2}")); + validLetterGroups.put("M", new Tuple<>("%{MONTHNUM}", "\\d{1,2}")); + validLetterGroups.put("MM", new Tuple<>("%{MONTHNUM2}", "\\d{2}")); + // The simple regex here is based on the fact that the %{MONTH} Grok pattern only matches English and German month names + validLetterGroups.put("MMM", new Tuple<>("%{MONTH}", "[A-Z]\\S{2}")); + validLetterGroups.put("MMMM", new Tuple<>("%{MONTH}", "[A-Z]\\S{2,8}")); + validLetterGroups.put("d", new Tuple<>("%{MONTHDAY}", "\\d{1,2}")); + validLetterGroups.put("dd", new Tuple<>("%{MONTHDAY}", "\\d{2}")); + // The simple regex here is based on the fact that the %{DAY} Grok pattern only matches English and German day names + validLetterGroups.put("EEE", new Tuple<>("%{DAY}", "[A-Z]\\S{2}")); + validLetterGroups.put("EEEE", new Tuple<>("%{DAY}", "[A-Z]\\S{2,8}")); + validLetterGroups.put("H", new Tuple<>("%{HOUR}", "\\d{1,2}")); + validLetterGroups.put("HH", new Tuple<>("%{HOUR}", "\\d{2}")); + validLetterGroups.put("h", new Tuple<>("%{HOUR}", "\\d{1,2}")); + validLetterGroups.put("mm", new Tuple<>("%{MINUTE}", "\\d{2}")); + validLetterGroups.put("ss", new Tuple<>("%{SECOND}", "\\d{2}")); + validLetterGroups.put("a", new Tuple<>("(?:AM|PM)", "[AP]M")); + validLetterGroups.put("XX", new Tuple<>("%{ISO8601_TIMEZONE}", "(?:Z|[+-]\\d{4})")); + validLetterGroups.put("XXX", new Tuple<>("%{ISO8601_TIMEZONE}", "(?:Z|[+-]\\d{2}:\\d{2})")); + validLetterGroups.put("zzz", new Tuple<>("%{TZ}", "[A-Z]{3}")); + VALID_LETTER_GROUPS = Collections.unmodifiableMap(validLetterGroups); + } + + static final String CUSTOM_TIMESTAMP_GROK_NAME = "CUSTOM_TIMESTAMP"; + + /** + * Candidates for the special format strings (ISO8601, UNIX_MS, UNIX and TAI64N) + */ + static final CandidateTimestampFormat 
ISO8601_CANDIDATE_FORMAT = + new CandidateTimestampFormat(CandidateTimestampFormat::iso8601FormatFromExample, + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", "TIMESTAMP_ISO8601", + "1111 11 11 11 11", 0, 19); + static final CandidateTimestampFormat UNIX_MS_CANDIDATE_FORMAT = + new CandidateTimestampFormat(example -> Collections.singletonList("UNIX_MS"), "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT", + "1111111111111", 0, 0); + static final CandidateTimestampFormat UNIX_CANDIDATE_FORMAT = + new CandidateTimestampFormat(example -> Collections.singletonList("UNIX"), "\\b\\d{10}\\b", "\\b\\d{10}(?:\\.\\d{3,9})?\\b", + "NUMBER", "1111111111", 0, 10); + static final CandidateTimestampFormat TAI64N_CANDIDATE_FORMAT = + new CandidateTimestampFormat(example -> Collections.singletonList("TAI64N"), "\\b[0-9A-Fa-f]{24}\\b", "\\b[0-9A-Fa-f]{24}\\b", + "BASE16NUM"); /** * The first match in this list will be chosen, so it needs to be ordered @@ -64,427 +114,1210 @@ public final class TimestampFormatFinder { // The TOMCAT_DATESTAMP format has to come before ISO8601 because it's basically ISO8601 but // with a space before the timezone, and because the timezone is optional in ISO8601 it will // be recognised as that with the timezone missed off if ISO8601 is checked first - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS Z", "yyyy-MM-dd HH:mm:ss,SSS XX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", + new CandidateTimestampFormat(example -> CandidateTimestampFormat.iso8601LikeFormatFromExample(example, " ", " "), + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[:.,]\\d{3}", "\\b20\\d{2}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", - "TOMCAT_DATESTAMP", Arrays.asList(0, 1)), - // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so - // longhand formats are needed if there's a space instead - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZ", 
"yyyy-MM-dd HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b", - "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSSZZ", "yyyy-MM-dd HH:mm:ss,SSSXXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b", - "TIMESTAMP_ISO8601", Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss,SSS", "yyyy-MM-dd HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601", - Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)(?:Z|[+-]%{HOUR}%{MINUTE})\\b", "TIMESTAMP_ISO8601", - Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ssZZ", "yyyy-MM-dd HH:mm:ssXXX", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[+-]%{HOUR}:%{MINUTE}\\b", "TIMESTAMP_ISO8601", - Arrays.asList(0, 1)), - new CandidateTimestampFormat("YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)\\b", "TIMESTAMP_ISO8601", - Arrays.asList(0, 1)), - // When using Java time the Elasticsearch ISO8601 parser for fractional time requires that the fractional - // separator match the current JVM locale, which is too restrictive for arbitrary log file parsing - new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", - 
"\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}(?:Z|[+-]%{HOUR}%{MINUTE})\\b", - "TIMESTAMP_ISO8601", Collections.singletonList(3)), - new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXXX", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}[+-]%{HOUR}:%{MINUTE}\\b", - "TIMESTAMP_ISO8601", Collections.singletonList(3)), - new CandidateTimestampFormat("ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", - "\\b%{YEAR}-%{MONTHNUM}-%{MONTHDAY}T%{HOUR}:?%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "TIMESTAMP_ISO8601", - Collections.singletonList(3)), - new CandidateTimestampFormat("ISO8601", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "\\b%{TIMESTAMP_ISO8601}\\b", - "TIMESTAMP_ISO8601", Collections.singletonList(3)), - new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm:ss zzz", "EEE MMM dd yyyy HH:mm:ss zzz", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ}\\b", "DATESTAMP_RFC822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE MMM dd YYYY HH:mm zzz", "EEE MMM dd yyyy HH:mm zzz", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE} %{TZ}\\b", "DATESTAMP_RFC822", Collections.singletonList(1)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss ZZ", "EEE, dd MMM yyyy HH:mm:ss XXX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", - "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", - "DATESTAMP_RFC2822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm:ss Z", "EEE, dd MMM yyyy HH:mm:ss XX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} 
\\d{2}:\\d{2}:\\d{2} ", - "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:Z|[+-]%{HOUR}%{MINUTE})\\b", - "DATESTAMP_RFC2822", Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm ZZ", "EEE, dd MMM yyyy HH:mm XXX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", - "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}:%{MINUTE})\\b", "DATESTAMP_RFC2822", - Collections.singletonList(1)), - new CandidateTimestampFormat("EEE, dd MMM YYYY HH:mm Z", "EEE, dd MMM yyyy HH:mm XX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", - "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE} (?:Z|[+-]%{HOUR}%{MINUTE})\\b", "DATESTAMP_RFC2822", - Collections.singletonList(1)), - new CandidateTimestampFormat("EEE MMM dd HH:mm:ss zzz YYYY", "EEE MMM dd HH:mm:ss zzz yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", - Arrays.asList(1, 2)), - new CandidateTimestampFormat("EEE MMM dd HH:mm zzz YYYY", "EEE MMM dd HH:mm zzz yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE} %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", Collections.singletonList(1)), - new CandidateTimestampFormat("YYYYMMddHHmmss", "yyyyMMddHHmmss", "\\b\\d{14}\\b", + "TOMCAT_DATESTAMP", "1111 11 11 11 11 11 111", 0, 13), + ISO8601_CANDIDATE_FORMAT, + new CandidateTimestampFormat( + example -> Arrays.asList("EEE MMM dd yy HH:mm:ss zzz", "EEE MMM d yy HH:mm:ss zzz"), + "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{1,2} \\d{2} \\d{2}:\\d{2}:\\d{2}\\b", + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ}\\b", "DATESTAMP_RFC822", + Arrays.asList(" 11 11 11 11 11", " 1 11 11 11 11"), 0, 5), + new CandidateTimestampFormat( + example -> 
CandidateTimestampFormat.adjustTrailingTimezoneFromExample(example, "EEE, dd MMM yyyy HH:mm:ss XX"), + "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + "\\b%{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) (?:Z|[+-]%{HOUR}:?%{MINUTE})\\b", + "DATESTAMP_RFC2822", Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 7), + new CandidateTimestampFormat( + example -> Arrays.asList("EEE MMM dd HH:mm:ss zzz yyyy", "EEE MMM d HH:mm:ss zzz yyyy"), + "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}(?::(?:[0-5][0-9]|60)) %{TZ} %{YEAR}\\b", "DATESTAMP_OTHER", + Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 12, 10), + new CandidateTimestampFormat(example -> Collections.singletonList("yyyyMMddHHmmss"), "\\b\\d{14}\\b", "\\b20\\d{2}%{MONTHNUM2}(?:(?:0[1-9])|(?:[12][0-9])|(?:3[01]))(?:2[0123]|[01][0-9])%{MINUTE}(?:[0-5][0-9]|60)\\b", - "DATESTAMP_EVENTLOG"), - new CandidateTimestampFormat("EEE MMM dd HH:mm:ss YYYY", "EEE MMM dd HH:mm:ss yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", - "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", Arrays.asList(1, 2)), - new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"), - Arrays.asList("MMM dd HH:mm:ss,SSS", "MMM d HH:mm:ss,SSS"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2},\\d{3}", - "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)[:.,][0-9]{3,9}\\b", "SYSLOGTIMESTAMP", - Collections.singletonList(1)), - new CandidateTimestampFormat(Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", - "SYSLOGTIMESTAMP", Collections.singletonList(1)), - new CandidateTimestampFormat("dd/MMM/YYYY:HH:mm:ss Z", 
"dd/MMM/yyyy:HH:mm:ss XX", + "DATESTAMP_EVENTLOG", "11111111111111", 0, 0), + new CandidateTimestampFormat(example -> Collections.singletonList("EEE MMM dd HH:mm:ss yyyy"), + "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", + "\\b%{DAY} %{MONTH} %{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) %{YEAR}\\b", "HTTPDERROR_DATE", + " 11 11 11 11 1111", 0, 0), + new CandidateTimestampFormat( + example -> CandidateTimestampFormat.expandDayAndAdjustFractionalSecondsFromExample(example, "MMM dd HH:mm:ss"), + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + "%{MONTH} +%{MONTHDAY} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)(?:[:.,][0-9]{3,9})?\\b", "SYSLOGTIMESTAMP", + Arrays.asList(" 11 11 11 11", " 1 11 11 11"), 4, 10), + new CandidateTimestampFormat(example -> Collections.singletonList("dd/MMM/yyyy:HH:mm:ss XX"), "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", - "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE"), - new CandidateTimestampFormat("MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", - "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", - "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP"), - new CandidateTimestampFormat(Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), - Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", - "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", Collections.singletonList(1)), - new CandidateTimestampFormat("UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "\\b\\d{13}\\b", "POSINT"), - new CandidateTimestampFormat("UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "\\b\\d{10}\\.(?:\\d{3}){1,3}\\b", "NUMBER"), - new CandidateTimestampFormat("UNIX", "UNIX", "\\b\\d{10}\\b", "\\b\\d{10}\\b", "POSINT"), - new CandidateTimestampFormat("TAI64N", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", 
"\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM") + "\\b%{MONTHDAY}/%{MONTH}/%{YEAR}:%{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) [+-]?%{HOUR}%{MINUTE}\\b", "HTTPDATE", + "11 1111 11 11 11", 0, 6), + new CandidateTimestampFormat(example -> Collections.singletonList("MMM dd, yyyy h:mm:ss a"), + "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", + "%{MONTH} %{MONTHDAY}, 20\\d{2} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60) (?:AM|PM)\\b", "CATALINA_DATESTAMP", + Arrays.asList(" 11 1111 1 11 11", " 11 1111 11 11 11"), 0, 3), + new CandidateTimestampFormat(example -> Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), + "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + "%{MONTH} +%{MONTHDAY} %{YEAR} %{HOUR}:%{MINUTE}:(?:[0-5][0-9]|60)\\b", "CISCOTIMESTAMP", + Arrays.asList(" 11 1111 11 11 11", " 1 1111 11 11 11"), 0, 0), + new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "\\b%{DATESTAMP}\\b", "DATESTAMP", + // In DATESTAMP the month may be 1 or 2 digits, but the day must be 2 + Arrays.asList("11 11 1111 11 11 11", "1 11 1111 11 11 11", "11 1 1111 11 11 11"), 0, 10), + new CandidateTimestampFormat(CandidateTimestampFormat::indeterminateDayMonthFormatFromExample, + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "\\b%{DATE}\\b", "DATE", + // In DATE the month may be 1 or 2 digits, but the day must be 2 + Arrays.asList("11 11 1111", "11 1 1111", "1 11 1111"), 0, 0), + UNIX_MS_CANDIDATE_FORMAT, + UNIX_CANDIDATE_FORMAT, + TAI64N_CANDIDATE_FORMAT, + // This one is an ISO8601 date with no time, but the TIMESTAMP_ISO8601 Grok pattern doesn't cover it + new CandidateTimestampFormat(example -> Collections.singletonList("ISO8601"), + "\\b\\d{4}-\\d{2}-\\d{2}\\b", "\\b%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}\\b", CUSTOM_TIMESTAMP_GROK_NAME, + "1111 11 11", 0, 0) ); - private TimestampFormatFinder() { + /** + * It is expected that the explanation will be shared with other 
/**
 * Reasons for the decisions made while looking for timestamp formats.
 * It is expected that the explanation will be shared with other code.
 * Both this class and other classes will update it.
 */
private final List<String> explanation;
/** Must each sample represent a timestamp in its entirety, rather than merely contain one? */
private final boolean requireFullMatch;
/** Should an exception be thrown when a sample is added that does not contain a recognised timestamp? */
private final boolean errorOnNoTimestamp;
/** Should an exception be thrown when samples require different Grok patterns? */
private final boolean errorOnMultiplePatterns;
/** The candidates to try, in order: either the built-in list or a single candidate made from the override. */
private final List<CandidateTimestampFormat> orderedCandidateFormats;
private final TimeoutChecker timeoutChecker;
/** One entry per sample in which a timestamp was found. */
private final List<TimestampMatch> matches;
// These two are not volatile because the class is explicitly not for use from multiple threads.
// But if it ever were to be made thread safe, making these volatile would be one required step.
private List<TimestampFormat> matchedFormats;
private List<String> cachedJavaTimestampFormats;

/**
 * Construct without any specific timestamp format override.
 * @param explanation             List of reasons for making decisions.  May contain items when passed and new reasons
 *                                can be appended by the methods of this class.
 * @param requireFullMatch        Must samples added to this object represent a timestamp in their entirety?
 * @param errorOnNoTimestamp      Should an exception be thrown if a sample is added that does not contain a recognised timestamp?
 * @param errorOnMultiplePatterns Should an exception be thrown if samples are uploaded that require different Grok patterns?
 * @param timeoutChecker          Will abort the operation if its timeout is exceeded.
 */
public TimestampFormatFinder(List<String> explanation, boolean requireFullMatch, boolean errorOnNoTimestamp,
                             boolean errorOnMultiplePatterns, TimeoutChecker timeoutChecker) {
    this(explanation, null, requireFullMatch, errorOnNoTimestamp, errorOnMultiplePatterns, timeoutChecker);
}

/**
 * Construct with a timestamp format override.
 * @param explanation             List of reasons for making decisions.  May contain items when passed and new reasons
 *                                can be appended by the methods of this class.
 * @param overrideFormat          A timestamp format that will take precedence when looking for timestamps.  If <code>null</code>
 *                                then the effect is to have no such override, i.e. equivalent to calling the other constructor.
 *                                Timestamps will also be matched that have slightly different formats, but match the same Grok
 *                                pattern as is implied by the override format.
 * @param requireFullMatch        Must samples added to this object represent a timestamp in their entirety?
 * @param errorOnNoTimestamp      Should an exception be thrown if a sample is added that does not contain a recognised timestamp?
 * @param errorOnMultiplePatterns Should an exception be thrown if samples are uploaded that require different Grok patterns?
 * @param timeoutChecker          Will abort the operation if its timeout is exceeded.
 * @throws NullPointerException   If <code>explanation</code> or <code>timeoutChecker</code> is <code>null</code>.
 */
public TimestampFormatFinder(List<String> explanation, @Nullable String overrideFormat, boolean requireFullMatch,
                             boolean errorOnNoTimestamp, boolean errorOnMultiplePatterns, TimeoutChecker timeoutChecker) {
    this.explanation = Objects.requireNonNull(explanation);
    // Validate the timeout checker before it is handed to makeCandidateFromOverrideFormat() below,
    // so that a null argument is reported here rather than from somewhere inside the candidate search
    this.timeoutChecker = Objects.requireNonNull(timeoutChecker);
    this.requireFullMatch = requireFullMatch;
    this.errorOnNoTimestamp = errorOnNoTimestamp;
    this.errorOnMultiplePatterns = errorOnMultiplePatterns;
    this.orderedCandidateFormats = (overrideFormat != null)
        ? Collections.singletonList(makeCandidateFromOverrideFormat(overrideFormat, timeoutChecker))
        : ORDERED_CANDIDATE_FORMATS;
    this.matches = new ArrayList<>();
    this.matchedFormats = new ArrayList<>();
}
- * @param requiredFormat A timestamp format that any returned match must support. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * Convert a user supplied Java timestamp format to a Grok pattern and simple regular expression. + * @param overrideFormat A user supplied Java timestamp format. + * @return A tuple where the first value is a Grok pattern and the second is a simple regex. */ - public static TimestampMatch findFirstMatch(String text, String requiredFormat, TimeoutChecker timeoutChecker) { - return findFirstMatch(text, 0, requiredFormat, timeoutChecker); + static Tuple overrideFormatToGrokAndRegex(String overrideFormat) { + + if (overrideFormat.indexOf('\n') >= 0 || overrideFormat.indexOf('\r') >= 0) { + throw new IllegalArgumentException("Multi-line timestamp formats [" + overrideFormat + "] not supported"); + } + + if (overrideFormat.indexOf(INDETERMINATE_FIELD_PLACEHOLDER) >= 0) { + throw new IllegalArgumentException("Timestamp format [" + overrideFormat + "] not supported because it contains [" + + INDETERMINATE_FIELD_PLACEHOLDER + "]"); + } + + StringBuilder grokPatternBuilder = new StringBuilder(); + StringBuilder regexBuilder = new StringBuilder(); + + boolean notQuoted = true; + char prevChar = '\0'; + String prevLetterGroup = null; + int pos = 0; + while (pos < overrideFormat.length()) { + char curChar = overrideFormat.charAt(pos); + + if (curChar == '\'') { + notQuoted = !notQuoted; + } else if (notQuoted && Character.isLetter(curChar)) { + int startPos = pos; + int endPos = startPos + 1; + while (endPos < overrideFormat.length() && overrideFormat.charAt(endPos) == curChar) { + ++endPos; + ++pos; + } + String letterGroup = overrideFormat.substring(startPos, endPos); + Tuple grokPatternAndRegexForGroup = VALID_LETTER_GROUPS.get(letterGroup); + if (grokPatternAndRegexForGroup == null) { + // Special case of fractional seconds + if (curChar != 'S' || 
FRACTIONAL_SECOND_SEPARATORS.indexOf(prevChar) == -1 || + "ss".equals(prevLetterGroup) == false || endPos - startPos > 9) { + String msg = "Letter group [" + letterGroup + "] in [" + overrideFormat + "] is not supported"; + if (curChar == 'S') { + msg += " because it is not preceeded by [ss] and a separator from [" + FRACTIONAL_SECOND_SEPARATORS + "]"; + } + throw new IllegalArgumentException(msg); + } + // No need to append to the Grok pattern as %{SECOND} already allows for an optional + // fraction, but we need to remove the separator that's included in %{SECOND} + grokPatternBuilder.deleteCharAt(grokPatternBuilder.length() - 1); + regexBuilder.append("\\d{").append(endPos - startPos).append('}'); + } else { + grokPatternBuilder.append(grokPatternAndRegexForGroup.v1()); + if (regexBuilder.length() == 0) { + regexBuilder.append("\\b"); + } + regexBuilder.append(grokPatternAndRegexForGroup.v2()); + } + if (pos + 1 == overrideFormat.length()) { + regexBuilder.append("\\b"); + } + prevLetterGroup = letterGroup; + } else { + if (PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX.indexOf(curChar) >= 0) { + grokPatternBuilder.append('\\'); + regexBuilder.append('\\'); + } + grokPatternBuilder.append(curChar); + regexBuilder.append(curChar); + } + + prevChar = curChar; + ++pos; + } + + if (prevLetterGroup == null) { + throw new IllegalArgumentException("No time format letter groups in override format [" + overrideFormat + "]"); + } + + return new Tuple<>(grokPatternBuilder.toString(), regexBuilder.toString()); } /** - * Find the first timestamp format that matches part of the supplied value, - * excluding a specified number of candidate formats. - * @param text The value that the returned timestamp format must exist within. - * @param ignoreCandidates The number of candidate formats to exclude from the search. + * Given a user supplied Java timestamp format, return an appropriate candidate timestamp object as required by this class. 
+ * The returned candidate might be a built-in one, or might be generated from the supplied format. + * @param overrideFormat A user supplied Java timestamp format. * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * @return An appropriate candidate timestamp object. */ - public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, TimeoutChecker timeoutChecker) { - return findFirstMatch(text, ignoreCandidates, null, timeoutChecker); + static CandidateTimestampFormat makeCandidateFromOverrideFormat(String overrideFormat, TimeoutChecker timeoutChecker) { + + // First check for a special format string + switch (overrideFormat.toUpperCase(Locale.ROOT)) { + case "ISO8601": + return ISO8601_CANDIDATE_FORMAT; + case "UNIX_MS": + return UNIX_MS_CANDIDATE_FORMAT; + case "UNIX": + return UNIX_CANDIDATE_FORMAT; + case "TAI64N": + return TAI64N_CANDIDATE_FORMAT; + } + + // Next check for a built-in candidate that incorporates the override, and prefer this + + // If the override is not a valid format then one or other of these two calls will + // throw, and that is how we'll report the invalid format to the user + Tuple grokPatternAndRegex = overrideFormatToGrokAndRegex(overrideFormat); + DateTimeFormatter javaTimeFormatter = DateTimeFormatter.ofPattern(overrideFormat, Locale.ROOT); + + // This timestamp (2001-02-03T04:05:06,123456789+0545) is chosen such that the month, day and hour all have just 1 digit. + // This means that it will distinguish between formats that do/don't output leading zeroes for month, day and hour. + // Additionally it has the full 9 digits of fractional second precision, to avoid the possibility of truncating the fraction. 
+ String generatedTimestamp = javaTimeFormatter.withZone(ZoneOffset.ofHoursMinutesSeconds(5, 45, 0)) + .format(Instant.ofEpochMilli(981173106123L).plusNanos(456789L)); + for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS) { + + TimestampMatch match = checkCandidate(candidate, generatedTimestamp, null, true, timeoutChecker); + if (match != null) { + return new CandidateTimestampFormat(example -> { + + // Modify the built-in candidate so it prefers to return the user supplied format + // if at all possible, and only falls back to standard logic for other situations + try { + // TODO consider support for overriding the locale too + // But since Grok only supports English and German date words ingest + // via Grok will fall down at an earlier stage for other languages... + javaTimeFormatter.parse(example); + return Collections.singletonList(overrideFormat); + } catch (DateTimeException e) { + return candidate.javaTimestampFormatSupplier.apply(example); + } + }, candidate.simplePattern.pattern(), candidate.strictGrokPattern, candidate.outputGrokPatternName); + } + } + + // None of the out-of-the-box formats were close, so use the built Grok pattern and simple regex for the override + return new CandidateTimestampFormat(example -> Collections.singletonList(overrideFormat), + grokPatternAndRegex.v2(), grokPatternAndRegex.v1(), CUSTOM_TIMESTAMP_GROK_NAME); } /** - * Find the first timestamp format that matches part of the supplied value, - * excluding a specified number of candidate formats. - * @param text The value that the returned timestamp format must exist within. - * @param ignoreCandidates The number of candidate formats to exclude from the search. - * @param requiredFormat A timestamp format that any returned match must support. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. + * Find the first timestamp format that matches part or all of the supplied text. + * @param candidate The timestamp candidate to consider. 
+ * @param text The text that the returned timestamp format must exist within. + * @param numberPosBitSet If not null, each bit must be set to true if and only if the + * corresponding position in {@code text} is a digit. + * @param requireFullMatch Does the candidate have to match the entire text? + * @param timeoutChecker Will abort the operation if its timeout is exceeded. * @return The timestamp format, or null if none matches. */ - public static TimestampMatch findFirstMatch(String text, int ignoreCandidates, String requiredFormat, TimeoutChecker timeoutChecker) { - if (ignoreCandidates >= ORDERED_CANDIDATE_FORMATS.size()) { - return null; - } - Boolean[] quickRuleoutMatches = new Boolean[QUICK_RULE_OUT_PATTERNS.size()]; - int index = ignoreCandidates; - String adjustedRequiredFormat = adjustRequiredFormat(requiredFormat); - for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - if (adjustedRequiredFormat == null || candidate.jodaTimestampFormats.contains(adjustedRequiredFormat) || - candidate.javaTimestampFormats.contains(adjustedRequiredFormat)) { - boolean quicklyRuledOut = false; - for (Integer quickRuleOutIndex : candidate.quickRuleOutIndices) { - if (quickRuleoutMatches[quickRuleOutIndex] == null) { - quickRuleoutMatches[quickRuleOutIndex] = QUICK_RULE_OUT_PATTERNS.get(quickRuleOutIndex).matcher(text).find(); + private static TimestampMatch checkCandidate(CandidateTimestampFormat candidate, String text, @Nullable BitSet numberPosBitSet, + boolean requireFullMatch, TimeoutChecker timeoutChecker) { + if (requireFullMatch) { + Map captures = timeoutChecker.grokCaptures(candidate.strictFullMatchGrok, text, + "timestamp format determination"); + if (captures != null) { + return new TimestampMatch(candidate, "", text, ""); + } + } else { + // Since a search in a long string that has sections that nearly match will be very slow, it's + // worth doing an initial sanity check to see if 
the relative positions of digits necessary to + // get a match exist first + Tuple boundsForCandidate = findBoundsForCandidate(candidate, numberPosBitSet); + if (boundsForCandidate.v1() >= 0) { + assert boundsForCandidate.v2() > boundsForCandidate.v1(); + String matchIn = text.substring(boundsForCandidate.v1(), Math.min(boundsForCandidate.v2(), text.length())); + Map captures = timeoutChecker.grokCaptures(candidate.strictSearchGrok, matchIn, + "timestamp format determination"); + if (captures != null) { + StringBuilder prefaceBuilder = new StringBuilder(); + if (boundsForCandidate.v1() > 0) { + prefaceBuilder.append(text.subSequence(0, boundsForCandidate.v1())); + } + prefaceBuilder.append(captures.getOrDefault(PREFACE, "")); + StringBuilder epilogueBuilder = new StringBuilder(); + epilogueBuilder.append(captures.getOrDefault(EPILOGUE, "")); + if (boundsForCandidate.v2() < text.length()) { + epilogueBuilder.append(text.subSequence(boundsForCandidate.v2(), text.length())); } - if (quickRuleoutMatches[quickRuleOutIndex] == false) { - quicklyRuledOut = true; + return new TimestampMatch(candidate, prefaceBuilder.toString(), text.substring(prefaceBuilder.length(), + text.length() - epilogueBuilder.length()), epilogueBuilder.toString()); + } + } else { + timeoutChecker.check("timestamp format determination"); + } + } + + return null; + } + + /** + * Add a sample value to be considered by the format finder. If {@code requireFullMatch} was set to + * true on construction then the entire sample will be tested to see if it is a timestamp, + * otherwise a timestamp may be detected as just a portion of the sample. An exception will be thrown + * if {@code errorOnNoTimestamp} was set to true on construction, and no timestamp is + * found. An exception will also be thrown if {@code errorOnMultiplePatterns} was set to true + * on construction and a new timestamp format is detected that cannot be merged with a previously detected + * format. 
+ * @param text The sample in which to detect a timestamp. + */ + public void addSample(String text) { + + BitSet numberPosBitSet = requireFullMatch ? null : stringToNumberPosBitSet(text); + + for (CandidateTimestampFormat candidate : orderedCandidateFormats) { + + TimestampMatch match = checkCandidate(candidate, text, numberPosBitSet, requireFullMatch, timeoutChecker); + if (match != null) { + TimestampFormat newFormat = match.timestampFormat; + boolean mustAdd = true; + for (int i = 0; i < matchedFormats.size(); ++i) { + TimestampFormat existingFormat = matchedFormats.get(i); + if (existingFormat.canMergeWith(newFormat)) { + matchedFormats.set(i, existingFormat.mergeWith(newFormat)); + mustAdd = false; + // Sharing formats considerably reduces the memory usage during the analysis + // when there are many samples, so reconstruct the match with a shared format + match = new TimestampMatch(match, matchedFormats.get(i)); break; } } - if (quicklyRuledOut == false) { - Map captures = timeoutChecker.grokCaptures(candidate.strictSearchGrok, text, - "timestamp format determination"); - if (captures != null) { - String preface = captures.getOrDefault(PREFACE, "").toString(); - String epilogue = captures.getOrDefault(EPILOGUE, "").toString(); - return makeTimestampMatch(candidate, index, preface, text.substring(preface.length(), - text.length() - epilogue.length()), epilogue); + if (mustAdd) { + if (errorOnMultiplePatterns && matchedFormats.isEmpty() == false) { + throw new IllegalArgumentException("Multiple timestamp formats found [" + + matchedFormats.get(0) + "] and [" + newFormat + "]"); } + matchedFormats.add(newFormat); } + + matches.add(match); + cachedJavaTimestampFormats = null; + return; } - ++index; } - return null; + + if (errorOnNoTimestamp) { + throw new IllegalArgumentException("No timestamp found in [" + text + "]"); + } } /** - * Find the best timestamp format for matching an entire field value. 
- * @param text The value that the returned timestamp format must match in its entirety. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * Where multiple timestamp formats have been found, select the "best" one, whose details + * will then be returned by methods such as {@link #getGrokPatternName} and + * {@link #getJavaTimestampFormats}. If fewer than two timestamp formats have been found + * then this method does nothing. */ - public static TimestampMatch findFirstFullMatch(String text, TimeoutChecker timeoutChecker) { - return findFirstFullMatch(text, 0, timeoutChecker); + public void selectBestMatch() { + + if (matchedFormats.size() < 2) { + // Nothing to do + return; + } + + double[] weights = calculateMatchWeights(); + timeoutChecker.check("timestamp format determination"); + int highestWeightFormatIndex = findHighestWeightIndex(weights); + timeoutChecker.check("timestamp format determination"); + selectHighestWeightFormat(highestWeightFormatIndex); } /** - * Find the best timestamp format for matching an entire field value. - * @param text The value that the returned timestamp format must match in its entirety. - * @param requiredFormat A timestamp format that any returned match must support. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * For each matched format, calculate a weight that can be used to decide which match is best. The + * weight for each matched format is the sum of the weights for all matches that have that format. + * @return An array of weights. There is one entry in the array for each entry in {@link #matchedFormats}, + * in the same order as the entries in {@link #matchedFormats}. 
*/ - public static TimestampMatch findFirstFullMatch(String text, String requiredFormat, TimeoutChecker timeoutChecker) { - return findFirstFullMatch(text, 0, requiredFormat, timeoutChecker); + private double[] calculateMatchWeights() { + + int remainingMatches = matches.size(); + double[] weights = new double[matchedFormats.size()]; + for (TimestampMatch match : matches) { + + for (int matchedFormatIndex = 0; matchedFormatIndex < matchedFormats.size(); ++matchedFormatIndex) { + if (matchedFormats.get(matchedFormatIndex).canMergeWith(match.timestampFormat)) { + weights[matchedFormatIndex] += weightForMatch(match.preface); + break; + } + ++matchedFormatIndex; + } + + // The highest possible weight is 1, so if the difference between the two highest weights + // is less than the number of lines remaining then the leader cannot possibly be overtaken + if (findDifferenceBetweenTwoHighestWeights(weights) > --remainingMatches) { + break; + } + } + + return weights; } /** - * Find the best timestamp format for matching an entire field value, - * excluding a specified number of candidate formats. - * @param text The value that the returned timestamp format must match in its entirety. - * @param ignoreCandidates The number of candidate formats to exclude from the search. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * Used to weight a timestamp match according to how far along the line it is found. + * Timestamps at the very beginning of the line are given a weight of 1. The weight + * progressively decreases the more text there is preceding the timestamp match, but + * is always greater than 0. + * @return A weight in the range (0, 1]. 
*/ - public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, TimeoutChecker timeoutChecker) { - return findFirstFullMatch(text, ignoreCandidates, null, timeoutChecker); + private static double weightForMatch(String preface) { + return Math.pow(1.0 + preface.length() / 15.0, -1.1); } /** - * Find the best timestamp format for matching an entire field value, - * excluding a specified number of candidate formats. - * @param text The value that the returned timestamp format must match in its entirety. - * @param ignoreCandidates The number of candidate formats to exclude from the search. - * @param requiredFormat A timestamp format that any returned match must support. - * @param timeoutChecker Will abort the operation if its timeout is exceeded. - * @return The timestamp format, or null if none matches. + * Given an array of weights, find the difference between the two highest values. + * @param weights Array of weights. Must have at least two elements. + * @return The difference between the two highest values. */ - public static TimestampMatch findFirstFullMatch(String text, int ignoreCandidates, String requiredFormat, - TimeoutChecker timeoutChecker) { - if (ignoreCandidates >= ORDERED_CANDIDATE_FORMATS.size()) { + private static double findDifferenceBetweenTwoHighestWeights(double[] weights) { + assert weights.length >= 2; + + double highestWeight = 0.0; + double secondHighestWeight = 0.0; + for (double weight : weights) { + if (weight > highestWeight) { + secondHighestWeight = highestWeight; + highestWeight = weight; + } else if (weight > secondHighestWeight) { + secondHighestWeight = weight; + } + } + return highestWeight - secondHighestWeight; + } + + /** + * Given an array of weights, find the index with the highest weight. + * @param weights Array of weights. + * @return The index of the element with the highest weight. 
+ */ + private static int findHighestWeightIndex(double[] weights) { + + double highestWeight = Double.NEGATIVE_INFINITY; + int highestWeightFormatIndex = -1; + for (int index = 0; index < weights.length; ++index) { + double weight = weights[index]; + if (weight > highestWeight) { + highestWeight = weight; + highestWeightFormatIndex = index; + } + } + + return highestWeightFormatIndex; + } + + /** + * Ensure the highest weight matched format is at the beginning of the list of matched formats. + * @param highestWeightFormatIndex The index of the matched format with the highest weight. + */ + private void selectHighestWeightFormat(int highestWeightFormatIndex) { + + assert highestWeightFormatIndex >= 0; + // If the selected format is already at the beginning of the list there's nothing to do + if (highestWeightFormatIndex == 0) { + return; + } + + cachedJavaTimestampFormats = null; + List newMatchedFormats = new ArrayList<>(matchedFormats); + // Swap the selected format with the one that's currently at the beginning of the list + newMatchedFormats.set(0, matchedFormats.get(highestWeightFormatIndex)); + newMatchedFormats.set(highestWeightFormatIndex, matchedFormats.get(0)); + matchedFormats = newMatchedFormats; + } + + /** + * How many different timestamp formats have been matched in the supplied samples? + * @return The number of different timestamp formats that have been matched in the supplied samples. + */ + public int getNumMatchedFormats() { + return matchedFormats.size(); + } + + /** + * Get the Grok pattern name that corresponds to the selected timestamp format. + * @return The Grok pattern name that corresponds to the selected timestamp format. 
+ */ + public String getGrokPatternName() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; return null; } - int index = ignoreCandidates; - String adjustedRequiredFormat = adjustRequiredFormat(requiredFormat); - for (CandidateTimestampFormat candidate : ORDERED_CANDIDATE_FORMATS.subList(ignoreCandidates, ORDERED_CANDIDATE_FORMATS.size())) { - if (adjustedRequiredFormat == null || candidate.jodaTimestampFormats.contains(adjustedRequiredFormat) || - candidate.javaTimestampFormats.contains(adjustedRequiredFormat)) { - Map captures = timeoutChecker.grokCaptures(candidate.strictFullMatchGrok, text, - "timestamp format determination"); - if (captures != null) { - return makeTimestampMatch(candidate, index, "", text, ""); + return matchedFormats.get(0).grokPatternName; + } + + /** + * Get the custom Grok pattern definitions derived from the override format, if any. + * @return The custom Grok pattern definitions for the selected timestamp format. + * If there are none an empty map is returned. + */ + public Map getCustomGrokPatternDefinitions() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; + return Collections.emptyMap(); + } + return matchedFormats.get(0).customGrokPatternDefinitions; + } + + /** + * Of all the samples added that correspond to the selected format, return + * the portion of the sample that comes before the timestamp. + * @return A list of prefaces from samples that match the selected timestamp format. 
+ */ + public List getPrefaces() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; + return Collections.emptyList(); + } + return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) + .map(match -> match.preface).collect(Collectors.toList()); + } + + /** + * Get the simple regular expression that can be used to identify timestamps + * of the selected format in almost any programming language. + * @return A {@link Pattern} that will match timestamps of the selected format. + */ + public Pattern getSimplePattern() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; + return null; + } + return matchedFormats.get(0).simplePattern; + } + + /** + * These are similar to Java timestamp formats but may contain indeterminate day/month + * placeholders if the order of day and month is uncertain. + * @return A list of Java timestamp formats possibly containing indeterminate day/month placeholders. + */ + public List getRawJavaTimestampFormats() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; + return Collections.emptyList(); + } + return matchedFormats.get(0).rawJavaTimestampFormats; + } + + /** + * These are used by ingest pipeline and index mappings. + * @return A list of Java timestamp formats to use for parsing documents. 
+ */ + public List getJavaTimestampFormats() { + if (cachedJavaTimestampFormats != null) { + return cachedJavaTimestampFormats; + } + return determiniseJavaTimestampFormats(getRawJavaTimestampFormats(), + // With multiple formats, only consider the matches that correspond to the first + // in the list (which is what we're returning information about via the getters). + // With just one format it's most efficient not to bother checking formats. + (matchedFormats.size() > 1) ? matchedFormats.get(0) : null); + } + + /** + * Given a list of timestamp formats that might contain indeterminate day/month parts, + * return the corresponding pattern with the placeholders replaced with concrete + * day/month formats. + */ + private List determiniseJavaTimestampFormats(List rawJavaTimestampFormats, + @Nullable TimestampFormat onlyConsiderFormat) { + + // This method needs rework if the class is ever made thread safe + + if (rawJavaTimestampFormats.stream().anyMatch(format -> format.indexOf(INDETERMINATE_FIELD_PLACEHOLDER) >= 0)) { + boolean isDayFirst = guessIsDayFirst(rawJavaTimestampFormats, onlyConsiderFormat, Locale.getDefault()); + cachedJavaTimestampFormats = rawJavaTimestampFormats.stream() + .map(format -> determiniseJavaTimestampFormat(format, isDayFirst)).collect(Collectors.toList()); + } else { + cachedJavaTimestampFormats = rawJavaTimestampFormats; + } + return cachedJavaTimestampFormats; + } + + /** + * If timestamp formats where the order of day and month could vary (as in a choice between dd/MM/yyyy + * or MM/dd/yyyy for example), make a guess about whether the day comes first. + * @return true if the day comes first and false if the month comes first. 
+ */ + private boolean guessIsDayFirst(List rawJavaTimestampFormats, @Nullable TimestampFormat onlyConsiderFormat, + Locale localeForFallback) { + + Boolean isDayFirst = guessIsDayFirstFromFormats(rawJavaTimestampFormats); + if (isDayFirst != null) { + return isDayFirst; + } + isDayFirst = guessIsDayFirstFromMatches(onlyConsiderFormat); + if (isDayFirst != null) { + return isDayFirst; + } + return guessIsDayFirstFromLocale(localeForFallback); + } + + /** + * If timestamp formats where the order of day and month could vary (as in a choice between dd/MM/yyyy + * or MM/dd/yyyy for example), make a guess about whether the day comes first based on quirks of the + * built-in Grok patterns. + * @return true if the day comes first, false if the month comes first, and + * null if there is insufficient evidence to decide. + */ + Boolean guessIsDayFirstFromFormats(List rawJavaTimestampFormats) { + + Boolean isDayFirst = null; + + for (String rawJavaTimestampFormat : rawJavaTimestampFormats) { + Matcher matcher = INDETERMINATE_FORMAT_INTERPRETER.matcher(rawJavaTimestampFormat); + if (matcher.matches()) { + String firstNumber = matcher.group(2); + assert firstNumber != null; + String secondNumber = matcher.group(4); + if (secondNumber == null) { + return null; + } + if (firstNumber.length() == 2 && secondNumber.length() == 1) { + if (Boolean.FALSE.equals(isDayFirst)) { + // Inconsistency + return null; + } + isDayFirst = Boolean.TRUE; } + if (firstNumber.length() == 1 && secondNumber.length() == 2) { + if (Boolean.TRUE.equals(isDayFirst)) { + // Inconsistency + return null; + } + isDayFirst = Boolean.FALSE; + } + } + } + + if (isDayFirst != null) { + if (isDayFirst) { + explanation.add("Guessing day precedes month in timestamps as all detected formats have a two digits in the first number " + + "and a single digit in the second number which is what the %{MONTHDAY} and %{MONTHNUM} Grok patterns permit"); + } else { + explanation.add("Guessing month precedes day in timestamps as 
all detected formats have a single digit in the first number " + + "and two digits in the second number which is what the %{MONTHNUM} and %{MONTHDAY} Grok patterns permit"); } - ++index; } + + return isDayFirst; + } + + /** + * If timestamp formats where the order of day and month could vary (as in a choice between dd/MM/yyyy + * or MM/dd/yyyy for example), make a guess about whether the day comes first based on observed values + * of the first and second numbers. + * @return true if the day comes first, false if the month comes first, and + * null if there is insufficient evidence to decide. + */ + Boolean guessIsDayFirstFromMatches(@Nullable TimestampFormat onlyConsiderFormat) { + + BitSet firstIndeterminateNumbers = new BitSet(); + BitSet secondIndeterminateNumbers = new BitSet(); + + for (TimestampMatch match : matches) { + + if (onlyConsiderFormat == null || onlyConsiderFormat.canMergeWith(match.timestampFormat)) { + + // Valid indeterminate day/month numbers will be in the range 1 to 31. + // -1 is used to mean "not present", and we ignore that here. 
+ + if (match.firstIndeterminateDateNumber > 0) { + assert match.firstIndeterminateDateNumber <= 31; + if (match.firstIndeterminateDateNumber > 12) { + explanation.add("Guessing day precedes month in timestamps as one sample had first number [" + + match.firstIndeterminateDateNumber + "]"); + return Boolean.TRUE; + } + firstIndeterminateNumbers.set(match.firstIndeterminateDateNumber); + } + if (match.secondIndeterminateDateNumber > 0) { + assert match.secondIndeterminateDateNumber <= 31; + if (match.secondIndeterminateDateNumber > 12) { + explanation.add("Guessing month precedes day in timestamps as one sample had second number [" + + match.secondIndeterminateDateNumber + "]"); + return Boolean.FALSE; + } + secondIndeterminateNumbers.set(match.secondIndeterminateDateNumber); + } + } + } + + // If there are many more values of one number than the other then assume that's the day + final int ratioForResult = 3; + int firstCardinality = firstIndeterminateNumbers.cardinality(); + int secondCardinality = secondIndeterminateNumbers.cardinality(); + if (secondCardinality == 0) { + // This happens in the following cases: + // - No indeterminate numbers (in which case the answer is irrelevant) + // - Only one indeterminate number (in which case we favour month over day) + return Boolean.FALSE; + } + // firstCardinality can be 0, but then secondCardinality should have been 0 too + assert firstCardinality > 0; + if (firstCardinality >= ratioForResult * secondCardinality) { + explanation.add("Guessing day precedes month in timestamps as there were [" + + firstCardinality + "] distinct values of the first number but only [" + secondCardinality + "] for the second"); + return Boolean.TRUE; + } + if (secondCardinality >= ratioForResult * firstCardinality) { + explanation.add("Guessing month precedes day in timestamps as there " + (firstCardinality == 1 ? "was" : "were") + " only [" + + firstCardinality + "] distinct " + (firstCardinality == 1 ? 
"value" : "values") + + " of the first number but [" + secondCardinality + "] for the second"); + return Boolean.FALSE; + } + return null; } /** - * If a required timestamp format contains a fractional seconds component, adjust it to the - * fractional seconds format that's in the candidate timestamp formats, i.e. ",SSS". So, for - * example, "YYYY-MM-dd HH:mm:ss.SSSSSSSSS Z" would get adjusted to "YYYY-MM-dd HH:mm:ss,SSS Z". + * If timestamp formats where the order of day and month could vary (as in a choice between dd/MM/yyyy + * or MM/dd/yyyy for example), make a guess about whether the day comes first based on the default order + * for a given locale. + * @return true if the day comes first and false if the month comes first. */ - static String adjustRequiredFormat(String requiredFormat) { + boolean guessIsDayFirstFromLocale(Locale locale) { - return (requiredFormat == null) ? null : - FRACTIONAL_SECOND_TIMESTAMP_FORMAT_PATTERN.matcher(requiredFormat).replaceFirst(DEFAULT_FRACTIONAL_SECOND_FORMAT); + // Fall back to whether the day comes before the month in the default short date format for the server locale. + // Can't use 1 as that occurs in 1970, so 3rd Feb is the earliest date that will reveal the server default. 
+ String feb3rd1970 = makeShortLocalizedDateTimeFormatterForLocale(locale).format(LocalDate.ofEpochDay(33)); + if (feb3rd1970.indexOf('3') < feb3rd1970.indexOf('2')) { + explanation.add("Guessing day precedes month in timestamps based on server locale [" + + locale.getDisplayName(Locale.ROOT) + "]"); + return true; + } else { + explanation.add("Guessing month precedes day in timestamps based on server locale [" + + locale.getDisplayName(Locale.ROOT) + "]"); + return false; + } } - private static TimestampMatch makeTimestampMatch(CandidateTimestampFormat chosenTimestampFormat, int chosenIndex, - String preface, String matchedDate, String epilogue) { - Tuple fractionalSecondsInterpretation = interpretFractionalSeconds(matchedDate); - List jodaTimestampFormats = chosenTimestampFormat.jodaTimestampFormats; - List javaTimestampFormats = chosenTimestampFormat.javaTimestampFormats; - Pattern simplePattern = chosenTimestampFormat.simplePattern; - char separator = fractionalSecondsInterpretation.v1(); - if (separator != DEFAULT_FRACTIONAL_SECOND_SEPARATOR) { - jodaTimestampFormats = jodaTimestampFormats.stream() - .map(jodaTimestampFormat -> jodaTimestampFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator)) - .collect(Collectors.toList()); - javaTimestampFormats = javaTimestampFormats.stream() - .map(javaTimestampFormat -> javaTimestampFormat.replace(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, separator)) - .collect(Collectors.toList()); - if (jodaTimestampFormats.stream().noneMatch(jodaTimestampFormat -> jodaTimestampFormat.startsWith("UNIX"))) { - String patternStr = simplePattern.pattern(); - int separatorPos = patternStr.lastIndexOf(DEFAULT_FRACTIONAL_SECOND_SEPARATOR); - if (separatorPos >= 0) { - StringBuilder newPatternStr = new StringBuilder(patternStr); - newPatternStr.replace(separatorPos, separatorPos + 1, ((separator == '.') ? 
"\\" : "") + separator); - simplePattern = Pattern.compile(newPatternStr.toString()); + @SuppressForbidden(reason = "DateTimeFormatter.ofLocalizedDate() is forbidden because it uses the default locale, " + + "but here we are explicitly setting the locale on the formatter in a subsequent call") + private static DateTimeFormatter makeShortLocalizedDateTimeFormatterForLocale(Locale locale) { + return DateTimeFormatter.ofLocalizedDate(FormatStyle.SHORT).withLocale(locale).withZone(ZoneOffset.UTC); + } + + /** + * Given a raw timestamp format that might contain indeterminate day/month parts, + * return the corresponding pattern with the placeholders replaced with concrete + * day/month formats. + */ + static String determiniseJavaTimestampFormat(String rawJavaTimestampFormat, boolean isDayFirst) { + + Matcher matcher = INDETERMINATE_FORMAT_INTERPRETER.matcher(rawJavaTimestampFormat); + if (matcher.matches()) { + StringBuilder builder = new StringBuilder(); + for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) { + switch (groupNum) { + case 2: { + char formatChar = isDayFirst ? 'd' : 'M'; + for (int count = matcher.group(groupNum).length(); count > 0; --count) { + builder.append(formatChar); + } + break; + } + case 4: { + char formatChar = isDayFirst ? 
'M' : 'd'; + for (int count = matcher.group(groupNum).length(); count > 0; --count) { + builder.append(formatChar); + } + break; + } + default: + builder.append(matcher.group(groupNum)); + break; } } + return builder.toString(); + } else { + return rawJavaTimestampFormat; } - int numberOfDigitsInFractionalComponent = fractionalSecondsInterpretation.v2(); - if (numberOfDigitsInFractionalComponent > 3) { - String fractionalSecondsFormat = "SSSSSSSSS".substring(0, numberOfDigitsInFractionalComponent); - jodaTimestampFormats = jodaTimestampFormats.stream() - .map(jodaTimestampFormat -> jodaTimestampFormat.replace("SSS", fractionalSecondsFormat)) - .collect(Collectors.toList()); - javaTimestampFormats = javaTimestampFormats.stream() - .map(javaTimestampFormat -> javaTimestampFormat.replace("SSS", fractionalSecondsFormat)) - .collect(Collectors.toList()); + } + + /** + * These are still used by Logstash. + * @return A list of Joda timestamp formats that correspond to the detected Java timestamp formats. + */ + public List getJodaTimestampFormats() { + List javaTimestampFormats = getJavaTimestampFormats(); + return (javaTimestampFormats == null) ? null : javaTimestampFormats.stream() + .map(format -> format.replace("yy", "YY").replace("XXX", "ZZ").replace("XX", "Z")).collect(Collectors.toList()); + } + + /** + * Does the parsing the timestamp produce different results depending on the timezone of the parser? + * I.e., does the textual representation NOT define the timezone? 
+ */ + public boolean hasTimezoneDependentParsing() { + if (matchedFormats.isEmpty()) { + // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake + assert errorOnNoTimestamp == false; + return false; } - return new TimestampMatch(chosenIndex, preface, jodaTimestampFormats, javaTimestampFormats, simplePattern, - chosenTimestampFormat.standardGrokPatternName, epilogue); + return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat)) + .anyMatch(match -> match.hasTimezoneDependentParsing); } /** - * Interpret the fractional seconds component of a date to determine two things: - * 1. The separator character - one of colon, comma and dot. - * 2. The number of digits in the fractional component. - * @param date The textual representation of the date for which fractional seconds are to be interpreted. - * @return A tuple of (fractional second separator character, number of digits in fractional component). + * Sometimes Elasticsearch mappings for dates need to include the format. + * This method returns appropriate mappings settings: at minimum "type" : "date", + * and possibly also a "format" setting. 
*/ - static Tuple interpretFractionalSeconds(String date) { + public Map getEsDateMappingTypeWithFormat() { + List javaTimestampFormats = getJavaTimestampFormats(); + if (javaTimestampFormats.contains("TAI64N")) { + // There's no format for TAI64N in the timestamp formats used in mappings + return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + } + Map mapping = new LinkedHashMap<>(); + mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + String formats = javaTimestampFormats.stream().map(format -> { + switch (format) { + case "ISO8601": + return "iso8601"; + case "UNIX_MS": + return "epoch_millis"; + case "UNIX": + return "epoch_second"; + default: + return format; + } + }).collect(Collectors.joining("||")); + if (formats.isEmpty() == false) { + mapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, formats); + } + return mapping; + } - Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(date); - if (matcher.find()) { - return new Tuple<>(matcher.group(1).charAt(0), matcher.group(2).length()); + /** + * Given a timestamp candidate and a bit set showing the positions of digits in a piece of text, find the range + * of indices over which the candidate might possibly match. Searching for Grok patterns that nearly + * match but don't quite is very expensive, so this method allows only a substring of a long string to be + * searched using the full Grok pattern. + * @param candidate The timestamp candidate to consider. + * @param numberPosBitSet If not null, each bit must be set to true if and only if the + * corresponding position in the original text is a digit. + * @return A tuple of the form (start index, end index). If the timestamp candidate cannot possibly match + * anywhere then (-1, -1) is returned. 
The end index in the returned tuple may be beyond the end of the + * string (because the bit set is not necessarily the same length as the string so it cannot be capped + * by this method), so the caller must cap it before passing to {@link String#substring(int, int)}. + */ + static Tuple findBoundsForCandidate(CandidateTimestampFormat candidate, BitSet numberPosBitSet) { + + if (numberPosBitSet == null || candidate.quickRuleOutBitSets.isEmpty()) { + return new Tuple<>(0, Integer.MAX_VALUE); } - return new Tuple<>(DEFAULT_FRACTIONAL_SECOND_SEPARATOR, 0); + int minFirstMatchStart = -1; + int maxLastMatchEnd = -1; + for (BitSet quickRuleOutBitSet : candidate.quickRuleOutBitSets) { + int currentMatch = findBitPattern(numberPosBitSet, 0, quickRuleOutBitSet); + if (currentMatch >= 0) { + if (minFirstMatchStart == -1 || currentMatch < minFirstMatchStart) { + minFirstMatchStart = currentMatch; + } + do { + int currentMatchEnd = currentMatch + quickRuleOutBitSet.length(); + if (currentMatchEnd > maxLastMatchEnd) { + maxLastMatchEnd = currentMatchEnd; + } + currentMatch = findBitPattern(numberPosBitSet, currentMatch + 1, quickRuleOutBitSet); + } while (currentMatch > 0); + } + } + if (minFirstMatchStart == -1) { + assert maxLastMatchEnd == -1; + return new Tuple<>(-1, -1); + } + int lowerBound = Math.max(0, minFirstMatchStart - candidate.maxCharsBeforeQuickRuleOutMatch); + int upperBound = (Integer.MAX_VALUE - candidate.maxCharsAfterQuickRuleOutMatch - maxLastMatchEnd < 0) ? + Integer.MAX_VALUE : (maxLastMatchEnd + candidate.maxCharsAfterQuickRuleOutMatch); + return new Tuple<>(lowerBound, upperBound); } /** - * Represents a timestamp that has matched a field value or been found within a message. + * This is basically the "Shift-Add" algorithm for string matching from the paper "A New Approach to Text Searching". + * In this case the "alphabet" has just two "characters": 0 and 1 (or false and true in + * some places because of the {@link BitSet} interface). 
+ * @see A New Approach to Text Searching + * @param findIn The binary string to search in; "text" in the terminology of the paper. + * @param beginIndex The index to start searching {@code findIn}. + * @param toFind The binary string to find; "pattern" in the terminology of the paper. + * @return The index (starting from 0) of the first match of {@code toFind} in {@code findIn}, or -1 if no match is found. */ - public static final class TimestampMatch { + static int findBitPattern(BitSet findIn, int beginIndex, BitSet toFind) { - /** - * The index of the corresponding entry in the ORDERED_CANDIDATE_FORMATS list. - */ - public final int candidateIndex; + assert beginIndex >= 0; - /** - * Text that came before the timestamp in the matched field/message. - */ - public final String preface; + // Note that this only compares up to the highest bit that is set, so trailing non digit characters will not participate + // in the comparison. This is not currently a problem for this class, but is something to consider if this functionality + // is ever reused elsewhere. The solution would be to use a wrapper class containing a BitSet and a separate int to store + // the length to compare. 
+ int toFindLength = toFind.length(); + int findInLength = findIn.length(); + if (toFindLength == 0) { + return beginIndex; + } + // 63 here is the largest bit position (starting from 0) in a long + if (toFindLength > Math.min(63, findInLength)) { + // Since we control the input we should avoid the situation + // where the pattern to find has more bits than a single long + assert toFindLength <= 63 : "Length to find was [" + toFindLength + "] - cannot be greater than 63"; + return -1; + } + // ~1L means all bits set except the least significant + long state = ~1L; + // This array has one entry per "character" in the "alphabet" (which for this method consists of just 0 and 1) + // ~0L means all bits set + long[] toFindMask = { ~0L, ~0L }; + for (int i = 0; i < toFindLength; ++i) { + toFindMask[toFind.get(i) ? 1 : 0] &= ~(1L << i); + } + for (int i = beginIndex; i < findInLength; ++i) { + state |= toFindMask[findIn.get(i) ? 1 : 0]; + state <<= 1; + if ((state & (1L << toFindLength)) == 0L) { + return i - toFindLength + 1; + } + } - /** - * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers. - */ - public final List jodaTimestampFormats; + return -1; + } + + /** + * Converts a string into a {@link BitSet} with one bit per character of the string and bits + * set to 1 if the corresponding character in the string is a digit and 0 if not. (The first + * character of the string corresponds to the least significant bit in the {@link BitSet}, so + * if the {@link BitSet} is printed in natural order it will be reversed compared to the input, + * and then the most significant bit will be printed first. However, in terms of random access + * to individual characters/bits, this "reversal" is by far the most intuitive representation.) + * @param str The string to be mapped. + * @return A {@link BitSet} suitable for use as input to {@link #findBitPattern}. 
+ */ + static BitSet stringToNumberPosBitSet(String str) { + + BitSet result = new BitSet(); + for (int index = 0; index < str.length(); ++index) { + if (Character.isDigit(str.charAt(index))) { + result.set(index); + } + } + return result; + } + + /** + * Represents an overall format matched within the supplied samples. + * Similar {@link TimestampFormat}s can be merged when they can be + * recognised by the same Grok pattern, simple regular expression, and + * punctuation in the preface, but have different Java timestamp formats. + * + * Objects are immutable. Merges that result in changes return new + * objects. + */ + static final class TimestampFormat { /** - * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers. + * Java time formats that may contain indeterminate day/month patterns. */ - public final List javaTimestampFormats; + final List rawJavaTimestampFormats; /** * A simple regex that will work in many languages to detect whether the timestamp format * exists in a particular line. */ - public final Pattern simplePattern; + final Pattern simplePattern; /** - * Name of an out-of-the-box Grok pattern that will match the timestamp. + * Name of a Grok pattern that will match the timestamp. */ - public final String grokPatternName; + final String grokPatternName; /** - * Text that came after the timestamp in the matched field/message. + * If {@link #grokPatternName} is not an out-of-the-box Grok pattern, then its definition. + */ + final Map customGrokPatternDefinitions; + + /** + * The punctuation characters in the text preceeding the timestamp in the samples. 
*/ - public final String epilogue; + final String prefacePunctuation; + + TimestampFormat(List rawJavaTimestampFormats, Pattern simplePattern, String grokPatternName, + Map customGrokPatternDefinitions, String prefacePunctuation) { + this.rawJavaTimestampFormats = Collections.unmodifiableList(rawJavaTimestampFormats); + this.simplePattern = Objects.requireNonNull(simplePattern); + this.grokPatternName = Objects.requireNonNull(grokPatternName); + this.customGrokPatternDefinitions = Objects.requireNonNull(customGrokPatternDefinitions); + this.prefacePunctuation = prefacePunctuation; + } + + boolean canMergeWith(TimestampFormat other) { + + if (this == other) { + return true; + } - TimestampMatch(int candidateIndex, String preface, String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, - String grokPatternName, String epilogue) { - this(candidateIndex, preface, Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), - simpleRegex, grokPatternName, epilogue); + return other != null && + this.simplePattern.pattern().equals(other.simplePattern.pattern()) && + this.grokPatternName.equals(other.grokPatternName) && + Objects.equals(this.customGrokPatternDefinitions, other.customGrokPatternDefinitions) && + this.prefacePunctuation.equals(other.prefacePunctuation); } - TimestampMatch(int candidateIndex, String preface, List jodaTimestampFormats, List javaTimestampFormats, - String simpleRegex, String grokPatternName, String epilogue) { - this(candidateIndex, preface, jodaTimestampFormats, javaTimestampFormats, Pattern.compile(simpleRegex), grokPatternName, - epilogue); + TimestampFormat mergeWith(TimestampFormat other) { + + if (canMergeWith(other)) { + if (rawJavaTimestampFormats.equals(other.rawJavaTimestampFormats) == false) { + // Do the merge like this to preserve ordering + Set mergedJavaTimestampFormats = new LinkedHashSet<>(rawJavaTimestampFormats); + if 
(mergedJavaTimestampFormats.addAll(other.rawJavaTimestampFormats)) { + return new TimestampFormat(new ArrayList<>(mergedJavaTimestampFormats), simplePattern, grokPatternName, + customGrokPatternDefinitions, prefacePunctuation); + } + } + // The merged format is exactly the same as this format, so there's no need to create a new object + return this; + } + + throw new IllegalArgumentException("Cannot merge timestamp format [" + this + "] with [" + other + "]"); + } + + @Override + public int hashCode() { + return Objects.hash(rawJavaTimestampFormats, simplePattern.pattern(), grokPatternName, customGrokPatternDefinitions, + prefacePunctuation); + } + + @Override + public boolean equals(Object other) { + if (this == other) { + return true; + } + if (other == null || getClass() != other.getClass()) { + return false; + } + + TimestampFormat that = (TimestampFormat) other; + return Objects.equals(this.rawJavaTimestampFormats, that.rawJavaTimestampFormats) && + Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) && + Objects.equals(this.grokPatternName, that.grokPatternName) && + Objects.equals(this.customGrokPatternDefinitions, that.customGrokPatternDefinitions) && + Objects.equals(this.prefacePunctuation, that.prefacePunctuation); } - TimestampMatch(int candidateIndex, String preface, List jodaTimestampFormats, List javaTimestampFormats, - Pattern simplePattern, String grokPatternName, - String epilogue) { - this.candidateIndex = candidateIndex; - this.preface = preface; - this.jodaTimestampFormats = Collections.unmodifiableList(jodaTimestampFormats); - this.javaTimestampFormats = Collections.unmodifiableList(javaTimestampFormats); - this.simplePattern = simplePattern; - this.grokPatternName = grokPatternName; - this.epilogue = epilogue; + @Override + public String toString() { + return "Java timestamp formats = " + rawJavaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + + ", simple pattern = '" + 
simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" + + (customGrokPatternDefinitions.isEmpty() ? "" : ", custom grok pattern definitions = " + customGrokPatternDefinitions) + + ", preface punctuation = '" + prefacePunctuation + "'"; } + } + + /** + * Represents one match of a timestamp in one added sample. + */ + static final class TimestampMatch { + + // This picks out punctuation that is likely to represent a field separator. It deliberately + // leaves out punctuation that's most likely to vary between field values, such as dots. + private static final Pattern NON_PUNCTUATION_PATTERN = Pattern.compile("[^\\\\/|~:;,<>()\\[\\]{}«»\t]+"); + + // Used for deciding whether an ISO8601 timestamp contains a timezone. + private static final Pattern ISO8601_TIMEZONE_PATTERN = Pattern.compile("(Z|[+-]\\d{2}:?\\d{2})$"); /** - * Does the parsing the timestamp produce different results depending on the timezone of the parser? - * I.e., does the textual representation NOT define the timezone? + * Text that came before the timestamp in the matched field/message. */ - public boolean hasTimezoneDependentParsing() { - return javaTimestampFormats.stream().anyMatch(javaTimestampFormat -> - javaTimestampFormat.indexOf('X') == -1 && javaTimestampFormat.indexOf('z') == -1 && javaTimestampFormat.contains("mm")); - } + final String preface; /** - * Sometimes Elasticsearch mappings for dates need to include the format. - * This method returns appropriate mappings settings: at minimum "type"="date", - * and possibly also a "format" setting. + * Time format specifier(s) that will work with Logstash and Ingest pipeline date parsers. + */ + final TimestampFormat timestampFormat; + + /** + * These store the first and second numbers when the ordering of day and month is unclear, + * for example in 05/05/2019. Where the ordering is obvious they are set to -1. 
*/ - public Map getEsDateMappingTypeWithFormat() { - if (javaTimestampFormats.contains("TAI64N")) { - // There's no format for TAI64N in the timestamp formats used in mappings - return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"); + final int firstIndeterminateDateNumber; + final int secondIndeterminateDateNumber; + + final boolean hasTimezoneDependentParsing; + + /** + * Text that came after the timestamp in the matched field/message. + */ + final String epilogue; + + TimestampMatch(CandidateTimestampFormat chosenTimestampFormat, String preface, String matchedDate, String epilogue) { + this.preface = Objects.requireNonNull(preface); + this.timestampFormat = new TimestampFormat(chosenTimestampFormat.javaTimestampFormatSupplier.apply(matchedDate), + chosenTimestampFormat.simplePattern, chosenTimestampFormat.outputGrokPatternName, + chosenTimestampFormat.customGrokPatternDefinitions(), + preface.isEmpty() ? preface : NON_PUNCTUATION_PATTERN.matcher(preface).replaceAll("")); + int[] indeterminateDateNumbers = parseIndeterminateDateNumbers(matchedDate, timestampFormat.rawJavaTimestampFormats); + this.firstIndeterminateDateNumber = indeterminateDateNumbers[0]; + this.secondIndeterminateDateNumber = indeterminateDateNumbers[1]; + this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0), + matchedDate); + this.epilogue = Objects.requireNonNull(epilogue); + } + + TimestampMatch(TimestampMatch toCopyExceptFormat, TimestampFormat timestampFormat) { + this.preface = toCopyExceptFormat.preface; + this.timestampFormat = Objects.requireNonNull(timestampFormat); + this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber; + this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber; + this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing; + this.epilogue = toCopyExceptFormat.epilogue; + } + + static boolean 
requiresTimezoneDependentParsing(String format, String matchedDate) { + switch (format) { + case "ISO8601": + assert matchedDate.length() > 6; + return ISO8601_TIMEZONE_PATTERN.matcher(matchedDate).find(matchedDate.length() - 6) == false; + case "UNIX_MS": + case "UNIX": + case "TAI64N": + return false; + default: + boolean notQuoted = true; + for (int pos = 0; pos < format.length(); ++pos) { + char curChar = format.charAt(pos); + if (curChar == '\'') { + notQuoted = !notQuoted; + } else if (notQuoted && (curChar == 'X' || curChar == 'z')) { + return false; + } + } + return true; } - Map mapping = new LinkedHashMap<>(); - mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); - String formats = javaTimestampFormats.stream().flatMap(format -> { - switch (format) { - case "ISO8601": - return Stream.empty(); - case "UNIX_MS": - return Stream.of("epoch_millis"); - case "UNIX": - return Stream.of("epoch_second"); - default: - return Stream.of(format); + } + + static int[] parseIndeterminateDateNumbers(String matchedDate, List rawJavaTimestampFormats) { + int[] indeterminateDateNumbers = { -1, -1 }; + + for (String rawJavaTimestampFormat : rawJavaTimestampFormats) { + + if (rawJavaTimestampFormat.indexOf(INDETERMINATE_FIELD_PLACEHOLDER) >= 0) { + + try { + // Parse leniently under the assumption the first sequence of hashes is day and the + // second is month - this may not be true but all we do is extract the numbers + String javaTimestampFormat = determiniseJavaTimestampFormat(rawJavaTimestampFormat, true); + + // TODO consider support for overriding the locale too + // But it's not clear-cut as Grok only knows English and German date + // words and for indeterminate formats we're expecting numbers anyway + DateTimeFormatter javaTimeFormatter = DateTimeFormatter.ofPattern(javaTimestampFormat, Locale.ROOT) + .withResolverStyle(ResolverStyle.LENIENT); + TemporalAccessor accessor = javaTimeFormatter.parse(matchedDate); + indeterminateDateNumbers[0] = 
accessor.get(ChronoField.DAY_OF_MONTH); + + // Now parse again leniently under the assumption the first sequence of hashes is month and the + // second is day - we have to do it twice and extract day as the lenient parser will wrap months > 12 + javaTimestampFormat = determiniseJavaTimestampFormat(rawJavaTimestampFormat, false); + + // TODO consider support for overriding the locale too + // But it's not clear-cut as Grok only knows English and German date + // words and for indeterminate formats we're expecting numbers anyway + javaTimeFormatter = DateTimeFormatter.ofPattern(javaTimestampFormat, Locale.ROOT) + .withResolverStyle(ResolverStyle.LENIENT); + accessor = javaTimeFormatter.parse(matchedDate); + indeterminateDateNumbers[1] = accessor.get(ChronoField.DAY_OF_MONTH); + if (indeterminateDateNumbers[0] > 0 && indeterminateDateNumbers[1] > 0) { + break; + } + } catch (DateTimeException e) { + // Move on to the next format + } } - }).collect(Collectors.joining("||")); - if (formats.isEmpty() == false) { - mapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, formats); } - return mapping; + + return indeterminateDateNumbers; } @Override public int hashCode() { - return Objects.hash(candidateIndex, preface, jodaTimestampFormats, javaTimestampFormats, simplePattern.pattern(), - grokPatternName, epilogue); + return Objects.hash(preface, timestampFormat, firstIndeterminateDateNumber, secondIndeterminateDateNumber, + hasTimezoneDependentParsing, epilogue); } @Override @@ -497,66 +1330,186 @@ public boolean equals(Object other) { } TimestampMatch that = (TimestampMatch) other; - return this.candidateIndex == that.candidateIndex && - Objects.equals(this.preface, that.preface) && - Objects.equals(this.jodaTimestampFormats, that.jodaTimestampFormats) && - Objects.equals(this.javaTimestampFormats, that.javaTimestampFormats) && - Objects.equals(this.simplePattern.pattern(), that.simplePattern.pattern()) && - Objects.equals(this.grokPatternName, that.grokPatternName) && + 
return Objects.equals(this.preface, that.preface) && + Objects.equals(this.timestampFormat, that.timestampFormat) && + this.firstIndeterminateDateNumber == that.firstIndeterminateDateNumber && + this.secondIndeterminateDateNumber == that.secondIndeterminateDateNumber && + this.hasTimezoneDependentParsing == that.hasTimezoneDependentParsing && Objects.equals(this.epilogue, that.epilogue); } @Override public String toString() { - return "index = " + candidateIndex + (preface.isEmpty() ? "" : ", preface = '" + preface + "'") + - ", Joda timestamp formats = " + jodaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + - ", Java timestamp formats = " + javaTimestampFormats.stream().collect(Collectors.joining("', '", "[ '", "' ]")) + - ", simple pattern = '" + simplePattern.pattern() + "', grok pattern = '" + grokPatternName + "'" + - (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'"); + return (preface.isEmpty() ? "" : "preface = '" + preface + "', ") + timestampFormat + + ((firstIndeterminateDateNumber > 0 || secondIndeterminateDateNumber > 0) + ? ", indeterminate date numbers = (" + firstIndeterminateDateNumber + "," + secondIndeterminateDateNumber + ")" + : "") + + ", has timezone-dependent parsing = " + hasTimezoneDependentParsing + + (epilogue.isEmpty() ? "" : ", epilogue = '" + epilogue + "'"); } } + /** + * Stores the details of a possible timestamp format to consider when looking for timestamps. 
+ */ static final class CandidateTimestampFormat { - final List jodaTimestampFormats; - final List javaTimestampFormats; + private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$"); + // This means that in the case of a literal Z, XXX is preferred + private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$"); + + final Function> javaTimestampFormatSupplier; final Pattern simplePattern; + final String strictGrokPattern; final Grok strictSearchGrok; final Grok strictFullMatchGrok; - final String standardGrokPatternName; - final List quickRuleOutIndices; - - CandidateTimestampFormat(String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, String strictGrokPattern, - String standardGrokPatternName) { - this(Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), simpleRegex, - strictGrokPattern, standardGrokPatternName); - } + final String outputGrokPatternName; + final List quickRuleOutBitSets; + final int maxCharsBeforeQuickRuleOutMatch; + final int maxCharsAfterQuickRuleOutMatch; - CandidateTimestampFormat(String jodaTimestampFormat, String javaTimestampFormat, String simpleRegex, String strictGrokPattern, - String standardGrokPatternName, List quickRuleOutIndices) { - this(Collections.singletonList(jodaTimestampFormat), Collections.singletonList(javaTimestampFormat), simpleRegex, - strictGrokPattern, standardGrokPatternName, quickRuleOutIndices); + CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, + String outputGrokPatternName) { + this(javaTimestampFormatSupplier, simpleRegex, strictGrokPattern, outputGrokPatternName, Collections.emptyList(), + Integer.MAX_VALUE, Integer.MAX_VALUE); } - CandidateTimestampFormat(List jodaTimestampFormats, List javaTimestampFormats, String simpleRegex, - String strictGrokPattern, String 
standardGrokPatternName) { - this(jodaTimestampFormats, javaTimestampFormats, simpleRegex, strictGrokPattern, standardGrokPatternName, - Collections.emptyList()); + CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, + String outputGrokPatternName, String quickRuleOutPattern, int maxCharsBeforeQuickRuleOutMatch, + int maxCharsAfterQuickRuleOutMatch) { + this(javaTimestampFormatSupplier, simpleRegex, strictGrokPattern, outputGrokPatternName, + Collections.singletonList(quickRuleOutPattern), maxCharsBeforeQuickRuleOutMatch, maxCharsAfterQuickRuleOutMatch); } - CandidateTimestampFormat(List jodaTimestampFormats, List javaTimestampFormats, String simpleRegex, - String strictGrokPattern, String standardGrokPatternName, List quickRuleOutIndices) { - this.jodaTimestampFormats = jodaTimestampFormats; - this.javaTimestampFormats = javaTimestampFormats; + CandidateTimestampFormat(Function> javaTimestampFormatSupplier, String simpleRegex, String strictGrokPattern, + String outputGrokPatternName, List quickRuleOutPatterns, int maxCharsBeforeQuickRuleOutMatch, + int maxCharsAfterQuickRuleOutMatch) { + this.javaTimestampFormatSupplier = Objects.requireNonNull(javaTimestampFormatSupplier); this.simplePattern = Pattern.compile(simpleRegex, Pattern.MULTILINE); + this.strictGrokPattern = Objects.requireNonNull(strictGrokPattern); // The (?m) here has the Ruby meaning, which is equivalent to (?s) in Java this.strictSearchGrok = new Grok(Grok.getBuiltinPatterns(), "(?m)%{DATA:" + PREFACE + "}" + strictGrokPattern + "%{GREEDYDATA:" + EPILOGUE + "}", TimeoutChecker.watchdog); this.strictFullMatchGrok = new Grok(Grok.getBuiltinPatterns(), "^" + strictGrokPattern + "$", TimeoutChecker.watchdog); - this.standardGrokPatternName = standardGrokPatternName; - assert quickRuleOutIndices.stream() - .noneMatch(quickRuleOutIndex -> quickRuleOutIndex < 0 || quickRuleOutIndex >= QUICK_RULE_OUT_PATTERNS.size()); - this.quickRuleOutIndices = 
quickRuleOutIndices; + this.outputGrokPatternName = Objects.requireNonNull(outputGrokPatternName); + this.quickRuleOutBitSets = quickRuleOutPatterns.stream().map(TimestampFormatFinder::stringToNumberPosBitSet) + .collect(Collectors.toList()); + assert maxCharsBeforeQuickRuleOutMatch >= 0; + this.maxCharsBeforeQuickRuleOutMatch = maxCharsBeforeQuickRuleOutMatch; + assert maxCharsAfterQuickRuleOutMatch >= 0; + this.maxCharsAfterQuickRuleOutMatch = maxCharsAfterQuickRuleOutMatch; + } + + Map customGrokPatternDefinitions() { + return CUSTOM_TIMESTAMP_GROK_NAME.equals(outputGrokPatternName) + ? Collections.singletonMap(CUSTOM_TIMESTAMP_GROK_NAME, strictGrokPattern) + : Collections.emptyMap(); + } + + static List iso8601FormatFromExample(String example) { + + // The Elasticsearch ISO8601 parser requires a literal T between the date and time, so + // longhand formats are needed if there's a space instead + return (example.indexOf('T') >= 0) ? Collections.singletonList("ISO8601") : iso8601LikeFormatFromExample(example, " ", ""); + } + + static List iso8601LikeFormatFromExample(String example, String timeSeparator, String timezoneSeparator) { + + StringBuilder builder = new StringBuilder("yyyy-MM-dd"); + builder.append(timeSeparator).append("HH:mm"); + + // Seconds are optional in ISO8601 + if (example.length() > builder.length() && example.charAt(builder.length()) == ':') { + builder.append(":ss"); + } + + if (example.length() > builder.length()) { + + // Add fractional seconds pattern if appropriate + char nextChar = example.charAt(builder.length()); + if (FRACTIONAL_SECOND_SEPARATORS.indexOf(nextChar) >= 0) { + builder.append(nextChar); + for (int pos = builder.length(); pos < example.length(); ++pos) { + if (Character.isDigit(example.charAt(pos))) { + builder.append('S'); + } else { + break; + } + } + } + + // Add timezone if appropriate - in the case of a literal Z, XX is preferred + if (example.length() > builder.length()) { + 
builder.append(timezoneSeparator).append((example.indexOf(':', builder.length()) > 0) ? "XXX" : "XX"); + } + } else { + // This method should not have been called if the example didn't include the bare minimum of date and time + assert example.length() == builder.length() : "Expected [" + example + "] and [" + builder + "] to be the same length"; + } + + return Collections.singletonList(builder.toString()); + } + + static List adjustTrailingTimezoneFromExample(String example, String formatWithSecondsAndXX) { + return Collections.singletonList( + TRAILING_OFFSET_WITHOUT_COLON_FINDER.matcher(example).find() ? formatWithSecondsAndXX : formatWithSecondsAndXX + "X"); + } + + private static String adjustFractionalSecondsFromEndOfExample(String example, String formatNoFraction) { + + Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(example); + return matcher.find() + ? (formatNoFraction + matcher.group(1).charAt(0) + "SSSSSSSSS".substring(0, matcher.group(2).length())) + : formatNoFraction; + } + + static List expandDayAndAdjustFractionalSecondsFromExample(String example, String formatWithddAndNoFraction) { + + String formatWithdd = adjustFractionalSecondsFromEndOfExample(example, formatWithddAndNoFraction); + return Arrays.asList(formatWithdd, formatWithdd.replace(" dd", " d")); + } + + static List indeterminateDayMonthFormatFromExample(String example) { + + StringBuilder builder = new StringBuilder(); + int examplePos = 0; + + // INDETERMINATE_FIELD_PLACEHOLDER here could represent either a day number (d) or month number (M) - it + // will get changed later based on evidence from many examples + for (Character patternChar + : Arrays.asList(INDETERMINATE_FIELD_PLACEHOLDER, INDETERMINATE_FIELD_PLACEHOLDER, 'y', 'H', 'm', 's')) { + + boolean foundDigit = false; + while (examplePos < example.length() && Character.isDigit(example.charAt(examplePos))) { + foundDigit = true; + builder.append(patternChar); + ++examplePos; + } + + if (patternChar == 's' || examplePos >= 
example.length() || foundDigit == false) { + break; + } + + builder.append(example.charAt(examplePos)); + ++examplePos; + } + + String format = builder.toString(); + // The Grok pattern should ensure we got at least as far as the year + assert format.contains("yy") : "Unexpected format [" + format + "] from example [" + example + "]"; + + if (examplePos < example.length()) { + // If we haven't consumed the whole example then we should have got as far as + // the (whole) seconds, and the bit afterwards should be the fractional seconds + assert builder.toString().endsWith("ss") : "Unexpected format [" + format + "] from example [" + example + "]"; + format = adjustFractionalSecondsFromEndOfExample(example, format); + } + + assert Character.isLetter(format.charAt(format.length() - 1)) + : "Unexpected format [" + format + "] from example [" + example + "]"; + assert format.length() == example.length() : "Unexpected format [" + format + "] from example [" + example + "]"; + + return Collections.singletonList(format); } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java index 53550ebf18dd3..d2572b7fd2085 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java @@ -8,7 +8,6 @@ import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; @@ -79,6 +78,9 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List 
explanatio ++linesConsumed; } + // null to allow GC before timestamp search + sampleDocEnds = null; + // If we get here the XML parser should have confirmed this assert messagePrefix.charAt(0) == '<'; String topLevelTag = messagePrefix.substring(1); @@ -91,17 +93,17 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List explanatio .setNumMessagesAnalyzed(sampleRecords.size()) .setMultilineStartPattern("^\\s*<" + topLevelTag); - Tuple timeField = + Tuple timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides, timeoutChecker); if (timeField != null) { boolean needClientTimeZone = timeField.v2().hasTimezoneDependentParsing(); structureBuilder.setTimestampField(timeField.v1()) - .setJodaTimestampFormats(timeField.v2().jodaTimestampFormats) - .setJavaTimestampFormats(timeField.v2().javaTimestampFormats) + .setJodaTimestampFormats(timeField.v2().getJodaTimestampFormats()) + .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats()) .setNeedClientTimezone(needClientTimeZone) - .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, topLevelTag + "." + timeField.v1(), - timeField.v2().javaTimestampFormats, needClientTimeZone)); + .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), + topLevelTag + "." 
+ timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone)); } Tuple, SortedMap> mappingsAndFieldStats = diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java index 10bdf0d16d8eb..280a50324e447 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderTests.java @@ -42,7 +42,7 @@ public void testCreateConfigsGivenCompleteCsv() throws Exception { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); - assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -77,7 +77,7 @@ public void testCreateConfigsGivenCompleteCsvAndColumnNamesOverride() throws Exc assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertEquals("^\"?time\"?,\"?message\"?", structure.getExcludeLinesPattern()); - assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -147,7 +147,7 @@ public void testCreateConfigsGivenCsvWithIncompleteLastRecord() throws Exception 
assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertEquals("^\"?message\"?,\"?time\"?,\"?count\"?", structure.getExcludeLinesPattern()); - assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -185,7 +185,7 @@ public void testCreateConfigsGivenCsvWithTrailingNulls() throws Exception { "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", structure.getExcludeLinesPattern()); - assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -230,7 +230,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsAndOverriddenTimeField() t "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?,\"?\"?,\"?\"?", structure.getExcludeLinesPattern()); - assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^.*?,.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), 
structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -270,7 +270,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeader() throws Exce "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", structure.getExcludeLinesPattern()); - assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); @@ -317,7 +317,7 @@ public void testCreateConfigsGivenCsvWithTrailingNullsExceptHeaderAndColumnNames "\"?RatecodeID\"?,\"?store_and_fwd_flag\"?,\"?PULocationID\"?,\"?DOLocationID\"?,\"?payment_type\"?,\"?fare_amount\"?," + "\"?extra\"?,\"?mta_tax\"?,\"?tip_amount\"?,\"?tolls_amount\"?,\"?improvement_surcharge\"?,\"?total_amount\"?", structure.getExcludeLinesPattern()); - assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", structure.getMultilineStartPattern()); + assertEquals("^.*?,\"?\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertEquals(Character.valueOf(','), structure.getDelimiter()); assertEquals(Character.valueOf('"'), structure.getQuote()); assertTrue(structure.getHasHeaderRow()); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java index 264521e68fb51..c0adccd0eb477 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java +++ 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtilsTests.java @@ -7,7 +7,6 @@ import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.util.Arrays; import java.util.Collections; @@ -35,12 +34,12 @@ public void testMoreLikelyGivenKeyword() { public void testGuessTimestampGivenSingleSampleSingleField() { Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd'T'HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeFieldOverride() { @@ -48,12 +47,12 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeField FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("field1").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd'T'HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", 
match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFieldOverride() { @@ -73,12 +72,12 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndConsistentTimeForma FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("ISO8601").build(); Map sample = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), overrides, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd'T'HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeFormatOverride() { @@ -97,18 +96,18 @@ public void testGuessTimestampGivenSingleSampleSingleFieldAndImpossibleTimeForma public void testGuessTimestampGivenSamplesWithSameSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("field1", "2018-05-24T17:33:39,406"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("field1", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd'T'HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", 
match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("ISO8601")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); - Map sample2 = Collections.singletonMap("field1", "2018-05-24 17:33:39,406"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Map sample2 = Collections.singletonMap("field1", "Thu May 24 17:33:39 2018"); + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -116,7 +115,7 @@ public void testGuessTimestampGivenSamplesWithOneSingleTimeFieldDifferentFormat( public void testGuessTimestampGivenSamplesWithDifferentSingleTimeField() { Map sample1 = Collections.singletonMap("field1", "2018-05-24T17:28:31,735"); Map sample2 = Collections.singletonMap("another_field", "2018-05-24T17:33:39,406"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -126,12 +125,12 @@ public void testGuessTimestampGivenSingleSampleManyFieldsOneTimeFormat() { sample.put("foo", "not a time"); sample.put("time", "2018-05-24 17:28:31,735"); sample.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Collections.singletonList(sample), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", 
match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { @@ -143,12 +142,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormat() { sample2.put("foo", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTimeFormat() { @@ -160,7 +159,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameTimeFieldDifferentTi sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ -174,12 +173,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("red_herring", "whatever"); sample2.put("time", "2018-05-29 11:53:02,837"); sample2.put("bar", 17); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = 
FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("yyyy-MM-dd HH:mm:ss,SSS")); - assertEquals("TIMESTAMP_ISO8601", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("yyyy-MM-dd HH:mm:ss,SSS")); + assertEquals("TIMESTAMP_ISO8601", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDistractionAfter() { @@ -191,12 +190,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsSameSingleTimeFormatDist sample2.put("foo", "whatever"); sample2.put("time", "May 29 2018 11:53:02"); sample2.put("red_herring", "17"); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); - assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); + assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName()); } public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() { @@ -208,7 +207,7 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentTimeFields() sample2.put("foo", "whatever"); sample2.put("time2", "May 29 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNull(match); } @@ 
-224,12 +223,12 @@ public void testGuessTimestampGivenSamplesWithManyFieldsInconsistentAndConsisten sample2.put("time2", "May 10 2018 11:53:02"); sample2.put("time3", "Thu, May 10 2018 11:53:02"); sample2.put("bar", 42); - Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), + Tuple match = FileStructureUtils.guessTimestampField(explanation, Arrays.asList(sample1, sample2), EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); assertNotNull(match); assertEquals("time2", match.v1()); - assertThat(match.v2().javaTimestampFormats, contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); - assertEquals("CISCOTIMESTAMP", match.v2().grokPatternName); + assertThat(match.v2().getJavaTimestampFormats(), contains("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss")); + assertEquals("CISCOTIMESTAMP", match.v2().getGrokPatternName()); } public void testGuessMappingGivenNothing() { @@ -273,7 +272,9 @@ public void testGuessMappingGivenLong() { } public void testGuessMappingGivenDate() { - Map expected = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + Map expected = new HashMap<>(); + expected.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expected.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); assertEquals(expected, guessMapping(explanation, "foo", Arrays.asList("2018-06-11T13:26:47Z", "2018-06-11T13:27:12Z"))); } @@ -347,18 +348,19 @@ public void testGuessMappingsAndCalculateFieldStats() { public void testMakeIngestPipelineDefinitionGivenStructuredWithoutTimestamp() { - assertNull(FileStructureUtils.makeIngestPipelineDefinition(null, null, null, false)); + assertNull(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null, null, false)); } @SuppressWarnings("unchecked") public void testMakeIngestPipelineDefinitionGivenStructuredWithTimestamp() { String timestampField = randomAlphaOfLength(10); - List timestampFormats = 
randomFrom(TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS).javaTimestampFormats; + List timestampFormats = randomFrom(Collections.singletonList("ISO8601"), + Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy")); boolean needClientTimezone = randomBoolean(); - Map pipeline = - FileStructureUtils.makeIngestPipelineDefinition(null, timestampField, timestampFormats, needClientTimezone); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), timestampField, + timestampFormats, needClientTimezone); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); @@ -382,11 +384,12 @@ public void testMakeIngestPipelineDefinitionGivenSemiStructured() { String grokPattern = randomAlphaOfLength(100); String timestampField = randomAlphaOfLength(10); - List timestampFormats = randomFrom(TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS).javaTimestampFormats; + List timestampFormats = randomFrom(Collections.singletonList("ISO8601"), + Arrays.asList("EEE MMM dd HH:mm:ss yyyy", "EEE MMM d HH:mm:ss yyyy")); boolean needClientTimezone = randomBoolean(); - Map pipeline = - FileStructureUtils.makeIngestPipelineDefinition(grokPattern, timestampField, timestampFormats, needClientTimezone); + Map pipeline = FileStructureUtils.makeIngestPipelineDefinition(grokPattern, Collections.emptyMap(), timestampField, + timestampFormats, needClientTimezone); assertNotNull(pipeline); assertEquals("Ingest pipeline created by file structure finder", pipeline.remove("description")); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java index dc48662fb35f7..7e6363602dcdd 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java +++ 
b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/GrokPatternCreatorTests.java @@ -43,7 +43,7 @@ public void testPopulatePrefacesAndEpiloguesGivenTimestamp() { Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); - candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); + candidate.processCaptures(explanation, fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "[")); assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG ")); @@ -60,7 +60,7 @@ public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() { Collection prefaces = new ArrayList<>(); Collection epilogues = new ArrayList<>(); - candidate.processCaptures(fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); + candidate.processCaptures(explanation, fieldNameCountStore, matchingStrings, prefaces, epilogues, null, null, NOOP_TIMEOUT_CHECKER); assertThat(prefaces, containsInAnyOrder("before ", "abc ", "")); assertThat(epilogues, containsInAnyOrder(" after", " xyz", "")); @@ -73,7 +73,8 @@ public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { "junk [2018-01-22T07:33:23] INFO ", "[2018-01-21T03:33:23] DEBUG "); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\[%{TIMESTAMP_ISO8601:extra_timestamp}\\] %{LOGLEVEL:loglevel} ", @@ -87,7 +88,8 @@ public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { " (4)", " (-5) "); - GrokPatternCreator grokPatternCreator = new 
GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?\\(%{INT:field}\\).*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -99,7 +101,8 @@ public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() "prior to-3", "-4"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); // It seems sensible that we don't detect these suffices as either base 10 or base 16 numbers @@ -113,7 +116,8 @@ public void testAppendBestGrokMatchForStringsGivenHexNumbers() { " -123", "1f is hex"); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); grokPatternCreator.appendBestGrokMatchForStrings(false, snippets, false, 0); assertEquals(".*?%{BASE16NUM:field}.*?", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -124,7 +128,8 @@ public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() { Collection snippets = Arrays.asList(" mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), NOOP_TIMEOUT_CHECKER); 
assertEquals("%{SYSLOGTIMESTAMP:timestamp} .*? .*?\\[%{INT:field}\\]: %{LOGLEVEL:loglevel} \\(.*? .*? .*?\\) .*? " + @@ -216,7 +225,7 @@ public void testCreateGrokPatternFromExamplesGivenCatalinaLogs() { "Invalid chunk ignored."); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), NOOP_TIMEOUT_CHECKER); assertEquals("%{CATALINA_DATESTAMP:timestamp} .*? .*?\\n%{LOGLEVEL:loglevel}: .*", @@ -239,7 +248,7 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { "Info\tsshd\tsubsystem request for sftp"); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), NOOP_TIMEOUT_CHECKER); assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + @@ -247,7 +256,101 @@ public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogs() { grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); assertEquals(5, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("extra_timestamp")); + Map expectedDateMapping = new HashMap<>(); + expectedDateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedDateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); + assertEquals(expectedDateMapping, mappings.get("extra_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), 
mappings.get("field2")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); + } + + public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndIndeterminateFormat() { + + // Two timestamps: one ISO8601, one indeterminate day/month + Collection sampleMessages = Arrays.asList( + "559550912540598297\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t20/04/2016 21:06:53,123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); + + assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{DATESTAMP:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + assertEquals(5, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); + Map expectedDateMapping = new HashMap<>(); + expectedDateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedDateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "dd/MM/yyyy HH:mm:ss,SSSSSS"); + 
assertEquals(expectedDateMapping, mappings.get("extra_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); + } + + public void testCreateGrokPatternFromExamplesGivenMultiTimestampLogsAndCustomDefinition() { + + // Two timestamps: one custom, one built-in + Collection sampleMessages = Arrays.asList( + "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), + NOOP_TIMEOUT_CHECKER); + + assertEquals("%{INT:field}\\t%{CUSTOM_TIMESTAMP:timestamp}\\t%{TIMESTAMP_ISO8601:extra_timestamp}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples("CUSTOM_TIMESTAMP", "timestamp")); + assertEquals(5, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); + Map expectedDateMapping = new HashMap<>(); + 
expectedDateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedDateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); + assertEquals(expectedDateMapping, mappings.get("extra_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); + } + + public void testCreateGrokPatternFromExamplesGivenTimestampAndTimeWithoutDate() { + + // Two timestamps: one with date, one without + Collection sampleMessages = Arrays.asList( + "559550912540598297\t2016-04-20T14:06:53\t21:06:53.123456\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t21:06:53.123456\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t21:06:53.123456\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t21:06:53.123456\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); + + assertEquals("%{INT:field}\\t%{TIMESTAMP_ISO8601:timestamp}\\t%{TIME:time}\\t%{INT:field2}\\t.*?\\t" + + "%{IP:ipaddress}\\t.*?\\t%{LOGLEVEL:loglevel}\\t.*", + grokPatternCreator.createGrokPatternFromExamples("TIMESTAMP_ISO8601", "timestamp")); + assertEquals(5, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field")); + 
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("time")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("field2")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("ipaddress")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("loglevel")); @@ -273,7 +376,7 @@ public void testFindFullLineGrokPatternGivenApacheCombinedLogs() { "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36\""); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), NOOP_TIMEOUT_CHECKER); assertEquals(new Tuple<>("timestamp", "%{COMBINEDAPACHELOG}"), @@ -304,7 +407,8 @@ public void testAdjustForPunctuationGivenCommonPrefix() { ",\"rule1\",\"Accept\",\"\",\"\",\"\",\"0000000000000000\"" ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); Collection adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("\",", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -321,7 +425,8 @@ public void testAdjustForPunctuationGivenNoCommonPrefix() { "was added by 'User1'(id:2) to servergroup 'GAME'(id:9)" ); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, NOOP_TIMEOUT_CHECKER); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, snippets, null, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); Collection 
adjustedSnippets = grokPatternCreator.adjustForPunctuation(snippets); assertEquals("", grokPatternCreator.getOverallGrokPatternBuilder().toString()); @@ -346,18 +451,61 @@ public void testValidateFullLineGrokPatternGivenValid() { "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + "Info\tsshd\tsubsystem request for sftp"); + Map mappings = new HashMap<>(); + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), + NOOP_TIMEOUT_CHECKER); + + grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); + assertEquals(9, mappings.size()); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("serial_no")); + Map expectedDateMapping = new HashMap<>(); + expectedDateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedDateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); + assertEquals(expectedDateMapping, mappings.get("local_timestamp")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("user_id")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("host")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("client_ip")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("method")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("severity")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("program")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("message")); + } + + public void testValidateFullLineGrokPatternGivenValidAndCustomDefinition() { + 
+ String timestampField = "local_timestamp"; + String grokPattern = "%{INT:serial_no}\\t%{CUSTOM_TIMESTAMP:local_timestamp}\\t%{TIMESTAMP_ISO8601:utc_timestamp}\\t" + + "%{INT:user_id}\\t%{HOSTNAME:host}\\t%{IP:client_ip}\\t%{WORD:method}\\t%{LOGLEVEL:severity}\\t%{PROG:program}\\t" + + "%{GREEDYDATA:message}"; + + // Two timestamps: one local, one UTC + Collection sampleMessages = Arrays.asList( + "559550912540598297\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t4/20/2016 2:06PM\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + Map mappings = new HashMap<>(); GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + Collections.singletonMap("CUSTOM_TIMESTAMP", "%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}(?:AM|PM)"), NOOP_TIMEOUT_CHECKER); grokPatternCreator.validateFullLineGrokPattern(grokPattern, timestampField); assertEquals(9, mappings.size()); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("serial_no")); - assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "date"), mappings.get("local_timestamp")); + Map expectedDateMapping = new HashMap<>(); + expectedDateMapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date"); + expectedDateMapping.put(FileStructureUtils.MAPPING_FORMAT_SETTING, "iso8601"); + assertEquals(expectedDateMapping, mappings.get("utc_timestamp")); 
assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "long"), mappings.get("user_id")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("host")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "ip"), mappings.get("client_ip")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("method")); + assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("severity")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("program")); assertEquals(Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword"), mappings.get("message")); } @@ -376,7 +524,7 @@ public void testValidateFullLineGrokPatternGivenInvalid() { "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); Map mappings = new HashMap<>(); - GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, + GrokPatternCreator grokPatternCreator = new GrokPatternCreator(explanation, sampleMessages, mappings, null, Collections.emptyMap(), NOOP_TIMEOUT_CHECKER); IllegalArgumentException e = expectThrows(IllegalArgumentException.class, diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java index 6cf4d61cf176c..6ac672f61780e 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinderTests.java @@ -5,11 +5,9 @@ */ package org.elasticsearch.xpack.ml.filestructurefinder; 
-import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.xpack.core.ml.filestructurefinder.FieldStats; import org.elasticsearch.xpack.core.ml.filestructurefinder.FileStructure; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.util.Collections; import java.util.Set; @@ -20,90 +18,6 @@ public class TextLogFileStructureFinderTests extends FileStructureTestCase { - private static final String EXCEPTION_TRACE_SAMPLE = - "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + - "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + - "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + - "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + - "such terms. The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + - "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + - "in length; got 49023\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat 
org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + - "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + - "~[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + - "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + - "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + - "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat 
org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + - "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + - "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + - ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + - "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + - "[elasticsearch-6.2.1.jar:6.2.1]\n" + 
- "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + - "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + - "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; - private FileStructureFinderFactory factory = new TextLogFileStructureFinderFactory(); public void testCreateConfigsGivenElasticsearchLog() throws Exception { @@ -124,7 +38,7 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); @@ -139,6 +53,47 @@ public void testCreateConfigsGivenElasticsearchLog() throws Exception { } } + public void testCreateConfigsGivenElasticsearchLogAndTimestampFormatOverride() throws Exception { + + String sample = "12/31/2018 1:40PM INFO foo\n" + + "1/31/2019 11:40AM DEBUG bar\n" + + "2/1/2019 11:00PM INFO foo\n" + + "2/2/2019 1:23AM DEBUG bar\n"; + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("M/d/yyyy h:mma").build(); + + assertTrue(factory.canCreateFromSample(explanation, sample)); + + String charset = randomFrom(POSSIBLE_CHARSETS); + Boolean hasByteOrderMarker = randomHasByteOrderMarker(charset); + FileStructureFinder structureFinder = factory.createFromSample(explanation, sample, charset, hasByteOrderMarker, overrides, + NOOP_TIMEOUT_CHECKER); + + FileStructure structure = structureFinder.getStructure(); + + assertEquals(FileStructure.Format.SEMI_STRUCTURED_TEXT, structure.getFormat()); + assertEquals(charset, structure.getCharset()); + if 
(hasByteOrderMarker == null) { + assertNull(structure.getHasByteOrderMarker()); + } else { + assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); + } + assertNull(structure.getExcludeLinesPattern()); + assertEquals("^\\d{1,2}/\\d{1,2}/\\d{4} \\d{1,2}:\\d{2}[AP]M\\b", structure.getMultilineStartPattern()); + assertNull(structure.getDelimiter()); + assertNull(structure.getQuote()); + assertNull(structure.getHasHeaderRow()); + assertNull(structure.getShouldTrimFields()); + assertEquals("%{CUSTOM_TIMESTAMP:timestamp} %{LOGLEVEL:loglevel} .*", structure.getGrokPattern()); + assertEquals("timestamp", structure.getTimestampField()); + assertEquals(Collections.singletonList("M/d/YYYY h:mma"), structure.getJodaTimestampFormats()); + FieldStats messageFieldStats = structure.getFieldStats().get("message"); + assertNotNull(messageFieldStats); + for (String statMessage : messageFieldStats.getTopHits().stream().map(m -> (String) m.get("value")).collect(Collectors.toList())) { + assertThat(structureFinder.getSampleMessages(), hasItem(statMessage)); + } + } + public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() throws Exception { FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampField("my_time").build(); @@ -160,7 +115,7 @@ public void testCreateConfigsGivenElasticsearchLogAndTimestampFieldOverride() th assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); @@ -197,7 +152,7 @@ public void testCreateConfigsGivenElasticsearchLogAndGrokPatternOverride() throw assertEquals(hasByteOrderMarker, structure.getHasByteOrderMarker()); } 
assertNull(structure.getExcludeLinesPattern()); - assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", structure.getMultilineStartPattern()); + assertEquals("^\\[\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", structure.getMultilineStartPattern()); assertNull(structure.getDelimiter()); assertNull(structure.getQuote()); assertNull(structure.getHasHeaderRow()); @@ -303,98 +258,4 @@ public void testCreateMultiLineMessageStartRegexGivenManyPrefacesIncludingEmpty( TextLogFileStructureFinder.createMultiLineMessageStartRegex(prefaces, simpleDateRegex)); } } - - public void testMostLikelyTimestampGivenAllSame() { - String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + - "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ (/dev/disk1)]], " + - "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" + - "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " + - "compressed ordinary object pointers [true]\n" + - "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" + - "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " + - "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " + - "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" + - "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " + - "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " + - "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " + - "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " + - "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " + - 
"-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " + - "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " + - "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " + - "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " + - "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " + - "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " + - "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" + - "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " + - "Elasticsearch and is not suitable for production\n" + - "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module 
[reindex]\n" + - "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [repository-url]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" + - "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + - "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; - - Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, - NOOP_TIMEOUT_CHECKER); - assertNotNull(mostLikelyMatch); - assertEquals(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); - } - - public void testMostLikelyTimestampGivenExceptionTrace() { - - Tuple> mostLikelyMatch = - 
TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, - NOOP_TIMEOUT_CHECKER); - assertNotNull(mostLikelyMatch); - - // Even though many lines have a timestamp near the end (in the Lucene version information), - // these are so far along the lines that the weight of the timestamp near the beginning of the - // first line should take precedence - assertEquals(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); - } - - public void testMostLikelyTimestampGivenExceptionTraceAndTimestampFormatOverride() { - - FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("yyyy-MM-dd HH:mm:ss").build(); - - Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); - assertNotNull(mostLikelyMatch); - - // The override should force the seemingly inferior choice of timestamp - assertEquals(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", - "TIMESTAMP_ISO8601", ""), mostLikelyMatch.v1()); - } - - public void testMostLikelyTimestampGivenExceptionTraceAndImpossibleTimestampFormatOverride() { - - FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); - - Tuple> mostLikelyMatch = - TextLogFileStructureFinder.mostLikelyTimestamp(EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); - assertNull(mostLikelyMatch); - } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java index 0374ed6f34175..b80e8a5712aaa 100644 --- 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinderTests.java @@ -6,9 +6,7 @@ package org.elasticsearch.xpack.ml.filestructurefinder; import org.elasticsearch.common.collect.Tuple; -import org.elasticsearch.common.joda.Joda; import org.elasticsearch.common.time.DateFormatter; -import org.elasticsearch.xpack.ml.filestructurefinder.TimestampFormatFinder.TimestampMatch; import java.time.Instant; import java.time.ZoneId; @@ -18,306 +16,981 @@ import java.time.temporal.TemporalAccessor; import java.time.temporal.TemporalQueries; import java.util.Arrays; +import java.util.BitSet; +import java.util.Collections; import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.regex.Pattern; public class TimestampFormatFinderTests extends FileStructureTestCase { - public void testFindFirstMatchGivenNoMatch() { - - assertNull(TimestampFormatFinder.findFirstMatch("", NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstMatch("no timestamps in here", NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstMatch(":::", NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstMatch("/+", NOOP_TIMEOUT_CHECKER)); - } - - public void testFindFirstMatchGivenOnlyIso8601() { - - validateTimestampMatch(new TimestampMatch(7, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T16:14:56,374Z", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(7, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374+0100", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(8, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSSXXX", - 
"\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374+01:00", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(9, "", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15T17:14:56,374", 1526400896374L); - - TimestampMatch pureIso8601Expected = new TimestampMatch(10, "", "ISO8601", "ISO8601", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""); - - validateTimestampMatch(pureIso8601Expected, "2018-05-15T16:14:56Z", 1526400896000L); - validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56+0100", 1526400896000L); - validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56+01:00", 1526400896000L); - validateTimestampMatch(pureIso8601Expected, "2018-05-15T17:14:56", 1526400896000L); - - validateTimestampMatch(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", "yyyy-MM-dd HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56,374Z", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(1, "", "YYYY-MM-dd HH:mm:ss,SSSZ", "yyyy-MM-dd HH:mm:ss,SSSXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+0100", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(2, "", "YYYY-MM-dd HH:mm:ss,SSSZZ", "yyyy-MM-dd HH:mm:ss,SSSXXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374+01:00", - 1526400896374L); - validateTimestampMatch(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss,SSS", "yyyy-MM-dd HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56,374", 1526400896374L); - validateTimestampMatch(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", 
"TIMESTAMP_ISO8601", ""), "2018-05-15 16:14:56Z", 1526400896000L); - validateTimestampMatch(new TimestampMatch(4, "", "YYYY-MM-dd HH:mm:ssZ", "yyyy-MM-dd HH:mm:ssXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+0100", 1526400896000L); - validateTimestampMatch(new TimestampMatch(5, "", "YYYY-MM-dd HH:mm:ssZZ", "yyyy-MM-dd HH:mm:ssXXX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56+01:00", 1526400896000L); - validateTimestampMatch(new TimestampMatch(6, "", "YYYY-MM-dd HH:mm:ss", "yyyy-MM-dd HH:mm:ss", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", "TIMESTAMP_ISO8601", ""), "2018-05-15 17:14:56", 1526400896000L); - } - - public void testFindFirstMatchGivenOnlyKnownTimestampFormat() { + private static final String EXCEPTION_TRACE_SAMPLE = + "[2018-02-28T14:49:40,517][DEBUG][o.e.a.b.TransportShardBulkAction] [an_index][2] failed to execute bulk item " + + "(index) BulkShardRequest [[an_index][2]] containing [33] requests\n" + + "java.lang.IllegalArgumentException: Document contains at least one immense term in field=\"message.keyword\" (whose UTF8 " + + "encoding is longer than the max length 32766), all of which were skipped. Please correct the analyzer to not produce " + + "such terms. 
The prefix of the first immense term is: '[60, 83, 79, 65, 80, 45, 69, 78, 86, 58, 69, 110, 118, 101, 108, " + + "111, 112, 101, 32, 120, 109, 108, 110, 115, 58, 83, 79, 65, 80, 45]...', original message: bytes can be at most 32766 " + + "in length; got 49023\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain$PerField.invert(DefaultIndexingChain.java:796) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processField(DefaultIndexingChain.java:430) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DefaultIndexingChain.processDocument(DefaultIndexingChain.java:392) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:240) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:496) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1729) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.apache.lucene.index.IndexWriter.addDocument(IndexWriter.java:1464) " + + "~[lucene-core-7.2.1.jar:7.2.1 b2b6438b37073bee1fca40374e85bf91aa457c0b - ubuntu - 2018-01-10 00:48:43]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:1070) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.engine.InternalEngine.indexIntoLucene(InternalEngine.java:1012) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.index.engine.InternalEngine.index(InternalEngine.java:878) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.index(IndexShard.java:738) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperation(IndexShard.java:707) ~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.applyIndexOperationOnPrimary(IndexShard.java:673) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequestOnPrimary(TransportShardBulkAction.java:548) " + + "~[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeIndexRequest(TransportShardBulkAction.java:140) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.executeBulkItemRequest(TransportShardBulkAction.java:236) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.performOnPrimary(TransportShardBulkAction.java:123) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:110) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.bulk.TransportShardBulkAction.shardOperationOnPrimary(TransportShardBulkAction.java:72) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1034) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryShardReference.perform" + + "(TransportReplicationAction.java:1012) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.ReplicationOperation.execute(ReplicationOperation.java:103) " + + 
"[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:359) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.onResponse" + + "(TransportReplicationAction.java:299) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:975) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$1.onResponse" + + "(TransportReplicationAction.java:972) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShardOperationPermits.acquire(IndexShardOperationPermits.java:238) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.index.shard.IndexShard.acquirePrimaryOperationPermit(IndexShard.java:2220) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.acquirePrimaryShardReference" + + "(TransportReplicationAction.java:984) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction.access$500(TransportReplicationAction.java:98) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$AsyncPrimaryAction.doRun" + + "(TransportReplicationAction.java:320) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:295) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat 
org.elasticsearch.action.support.replication.TransportReplicationAction$PrimaryOperationTransportHandler" + + ".messageReceived(TransportReplicationAction.java:282) [elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.RequestHandlerRegistry.processMessageReceived(RequestHandlerRegistry.java:66) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.transport.TransportService$7.doRun(TransportService.java:656) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.ThreadContext$ContextPreservingAbstractRunnable.doRun(ThreadContext.java:635) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat org.elasticsearch.common.util.concurrent.AbstractRunnable.run(AbstractRunnable.java:37) " + + "[elasticsearch-6.2.1.jar:6.2.1]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) [?:1.8.0_144]\n" + + "\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) [?:1.8.0_144]\n" + + "\tat java.lang.Thread.run(Thread.java:748) [?:1.8.0_144]\n"; + + public void testValidOverrideFormatToGrokAndRegex() { + + assertEquals(new Tuple<>("%{YEAR}-%{MONTHNUM2}-%{MONTHDAY}T%{HOUR}:%{MINUTE}:%{SECOND}%{ISO8601_TIMEZONE}", + "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}(?:Z|[+-]\\d{4})\\b"), + TimestampFormatFinder.overrideFormatToGrokAndRegex("yyyy-MM-dd'T'HH:mm:ss,SSSXX")); + assertEquals(new Tuple<>("%{MONTHDAY}\\.%{MONTHNUM2}\\.%{YEAR} %{HOUR}:%{MINUTE} (?:AM|PM)", + "\\b\\d{2}\\.\\d{2}\\.\\d{2} \\d{1,2}:\\d{2} [AP]M\\b"), + TimestampFormatFinder.overrideFormatToGrokAndRegex("dd.MM.yy h:mm a")); + assertEquals(new Tuple<>("%{MONTHNUM2}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", + "\\b\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b"), + TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss zzz")); + } + + public void testInvalidOverrideFormatToGrokAndRegex() { + + IllegalArgumentException e = 
expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy\nH:mm:ss zzz")); + assertEquals("Multi-line timestamp formats [MM/dd/yyyy\nH:mm:ss zzz] not supported", e.getMessage()); + e = expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/YYYY H:mm:ss zzz")); + assertEquals("Letter group [YYYY] in [MM/dd/YYYY H:mm:ss zzz] is not supported", e.getMessage()); + e = expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyy H:mm:ss zzz")); + assertEquals("Letter group [yyy] in [MM/dd/yyy H:mm:ss zzz] is not supported", e.getMessage()); + e = expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm:ss+SSSSSS")); + assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm:ss+SSSSSS] is not supported" + + " because it is not preceeded by [ss] and a separator from [:.,]", e.getMessage()); + e = expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex("MM/dd/yyyy H:mm,SSSSSS")); + assertEquals("Letter group [SSSSSS] in [MM/dd/yyyy H:mm,SSSSSS] is not supported" + + " because it is not preceeded by [ss] and a separator from [:.,]", e.getMessage()); + e = expectThrows(IllegalArgumentException.class, + () -> TimestampFormatFinder.overrideFormatToGrokAndRegex(" 'T' ")); + assertEquals("No time format letter groups in override format [ 'T' ]", e.getMessage()); + } + + public void testMakeCandidateFromOverrideFormat() { + + // Override is a special format + assertSame(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("ISO8601", NOOP_TIMEOUT_CHECKER)); + assertSame(TimestampFormatFinder.UNIX_MS_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX_MS", NOOP_TIMEOUT_CHECKER)); + 
assertSame(TimestampFormatFinder.UNIX_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("UNIX", NOOP_TIMEOUT_CHECKER)); + assertSame(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, + TimestampFormatFinder.makeCandidateFromOverrideFormat("TAI64N", NOOP_TIMEOUT_CHECKER)); + + // Override is covered by a built-in format + TimestampFormatFinder.CandidateTimestampFormat candidate = + TimestampFormatFinder.makeCandidateFromOverrideFormat("yyyy-MM-dd'T'HH:mm:ss.SSS", NOOP_TIMEOUT_CHECKER); + assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.outputGrokPatternName, candidate.outputGrokPatternName); + assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.strictGrokPattern, candidate.strictGrokPattern); + // Can't compare Grok objects as Grok doesn't implement equals() + assertEquals(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT.simplePattern.pattern(), candidate.simplePattern.pattern()); + // Exact format supplied is returned if it matches + assertEquals(Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss.SSS"), + candidate.javaTimestampFormatSupplier.apply("2018-05-15T16:14:56.374")); + // Other supported formats are returned if exact format doesn't match + assertEquals(Collections.singletonList("ISO8601"), candidate.javaTimestampFormatSupplier.apply("2018-05-15T16:14:56,374")); + + // Override is supported but not covered by any built-in format + candidate = + TimestampFormatFinder.makeCandidateFromOverrideFormat("MM/dd/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); + assertEquals(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, candidate.outputGrokPatternName); + assertEquals("%{MONTHNUM2}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", candidate.strictGrokPattern); + assertEquals("\\b\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b", candidate.simplePattern.pattern()); + assertEquals(Collections.singletonList("MM/dd/yyyy H:mm:ss zzz"), + candidate.javaTimestampFormatSupplier.apply("05/15/2018 16:14:56 UTC")); + + candidate 
= + TimestampFormatFinder.makeCandidateFromOverrideFormat("M/d/yyyy H:mm:ss zzz", NOOP_TIMEOUT_CHECKER); + assertEquals(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, candidate.outputGrokPatternName); + assertEquals("%{MONTHNUM}/%{MONTHDAY}/%{YEAR} %{HOUR}:%{MINUTE}:%{SECOND} %{TZ}", candidate.strictGrokPattern); + assertEquals("\\b\\d{1,2}/\\d{1,2}/\\d{4} \\d{1,2}:\\d{2}:\\d{2} [A-Z]{3}\\b", candidate.simplePattern.pattern()); + assertEquals(Collections.singletonList("M/d/yyyy H:mm:ss zzz"), + candidate.javaTimestampFormatSupplier.apply("5/15/2018 16:14:56 UTC")); + } + + public void testRequiresTimezoneDependentParsing() { + + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("ISO8601", "2018-05-15T17:14:56")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("ISO8601", "2018-05-15T17:14:56Z")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("ISO8601", "2018-05-15T17:14:56-0100")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("ISO8601", "2018-05-15T17:14:56+01:00")); + + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("UNIX_MS", "1526400896374")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("UNIX", "1526400896")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("TAI64N", "400000005afb078a164ac980")); + + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("EEE, dd MMM yyyy HH:mm:ss XXX", + "Tue, 15 May 2018 17:14:56 +01:00")); + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyyMMddHHmmss", "20180515171456")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("EEE MMM dd yy HH:mm:ss zzz", + "Tue May 15 18 16:14:56 UTC")); + 
assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy-MM-dd HH:mm:ss,SSS XX", + "2018-05-15 17:14:56,374 +0100")); + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("MMM dd HH:mm:ss.SSS", "May 15 17:14:56.725")); + + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd'zXz'HH:mm:ss", + "2018.05.15zXz17:14:56")); + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("yyyy.MM.dd HH:mm:ss'z'", + "2018.05.15 17:14:56z")); + assertTrue(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ss", + "XX2018.05.15 17:14:56")); + assertFalse(TimestampFormatFinder.TimestampMatch.requiresTimezoneDependentParsing("'XX'yyyy.MM.dd HH:mm:ssXX", + "XX2018.05.15 17:14:56Z")); + } + + public void testParseIndeterminateDateNumbers() { + + // Simplest case - nothing is indeterminate + int[] indeterminateDateNumbers = + TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("2018-05-15T16:14:56,374Z", + Collections.singletonList("yyyy-MM-dd'T'HH:mm:ss,SSSXX")); + assertEquals(2, indeterminateDateNumbers.length); + assertEquals(-1, indeterminateDateNumbers[0]); + assertEquals(-1, indeterminateDateNumbers[1]); + + // US with padding + indeterminateDateNumbers = + TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("05/15/2018 16:14:56", + Collections.singletonList("??/??/yyyy HH:mm:ss")); + assertEquals(2, indeterminateDateNumbers.length); + assertEquals(5, indeterminateDateNumbers[0]); + assertEquals(15, indeterminateDateNumbers[1]); + + // US without padding + indeterminateDateNumbers = + TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("5/15/2018 16:14:56", + Collections.singletonList("?/?/yyyy HH:mm:ss")); + assertEquals(2, indeterminateDateNumbers.length); + assertEquals(5, indeterminateDateNumbers[0]); + assertEquals(15, indeterminateDateNumbers[1]); + + // EU with 
padding + indeterminateDateNumbers = + TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/05/2018 16:14:56", + Collections.singletonList("??/??/yyyy HH:mm:ss")); + assertEquals(2, indeterminateDateNumbers.length); + assertEquals(15, indeterminateDateNumbers[0]); + assertEquals(5, indeterminateDateNumbers[1]); + + // EU without padding + indeterminateDateNumbers = + TimestampFormatFinder.TimestampMatch.parseIndeterminateDateNumbers("15/5/2018 16:14:56", + Collections.singletonList("?/?/yyyy HH:mm:ss")); + assertEquals(2, indeterminateDateNumbers.length); + assertEquals(15, indeterminateDateNumbers[0]); + assertEquals(5, indeterminateDateNumbers[1]); + } + + public void testDeterminiseJavaTimestampFormat() { + + // Indeterminate at the beginning of the pattern + assertEquals("dd/MM/yyyy HH:mm:ss", TimestampFormatFinder.determiniseJavaTimestampFormat("??/??/yyyy HH:mm:ss", true)); + assertEquals("MM/dd/yyyy HH:mm:ss", TimestampFormatFinder.determiniseJavaTimestampFormat("??/??/yyyy HH:mm:ss", false)); + assertEquals("d/M/yyyy HH:mm:ss", TimestampFormatFinder.determiniseJavaTimestampFormat("?/?/yyyy HH:mm:ss", true)); + assertEquals("M/d/yyyy HH:mm:ss", TimestampFormatFinder.determiniseJavaTimestampFormat("?/?/yyyy HH:mm:ss", false)); + // Indeterminate in the middle of the pattern + assertEquals("HH:mm:ss dd/MM/yyyy", TimestampFormatFinder.determiniseJavaTimestampFormat("HH:mm:ss ??/??/yyyy", true)); + assertEquals("HH:mm:ss MM/dd/yyyy", TimestampFormatFinder.determiniseJavaTimestampFormat("HH:mm:ss ??/??/yyyy", false)); + assertEquals("HH:mm:ss d/M/yyyy", TimestampFormatFinder.determiniseJavaTimestampFormat("HH:mm:ss ?/?/yyyy", true)); + assertEquals("HH:mm:ss M/d/yyyy", TimestampFormatFinder.determiniseJavaTimestampFormat("HH:mm:ss ?/?/yyyy", false)); + // No separators + assertEquals("ddMMyyyyHHmmss", TimestampFormatFinder.determiniseJavaTimestampFormat("????yyyyHHmmss", true)); + assertEquals("MMddyyyyHHmmss", 
TimestampFormatFinder.determiniseJavaTimestampFormat("????yyyyHHmmss", false)); + // It's unreasonable to expect a variable length format like 'd' or 'M' to work without separators + } + + public void testGuessIsDayFirstFromFormats() { + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/5/2018 16:14:56"); + timestampFormatFinder.addSample("06/6/2018 17:14:56"); + timestampFormatFinder.addSample("07/7/2018 18:14:56"); + + // This is based on the fact that %{MONTHNUM} can match a single digit whereas %{MONTHDAY} cannot + assertTrue(timestampFormatFinder.guessIsDayFirstFromFormats(timestampFormatFinder.getRawJavaTimestampFormats())); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("5/05/2018 16:14:56"); + timestampFormatFinder.addSample("6/06/2018 17:14:56"); + timestampFormatFinder.addSample("7/07/2018 18:14:56"); + + // This is based on the fact that %{MONTHNUM} can match a single digit whereas %{MONTHDAY} cannot + assertFalse(timestampFormatFinder.guessIsDayFirstFromFormats(timestampFormatFinder.getRawJavaTimestampFormats())); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("5/05/2018 16:14:56"); + timestampFormatFinder.addSample("06/6/2018 17:14:56"); + timestampFormatFinder.addSample("7/07/2018 18:14:56"); + + // Inconsistent so no decision + assertNull(timestampFormatFinder.guessIsDayFirstFromFormats(timestampFormatFinder.getRawJavaTimestampFormats())); + } + + public void testGuessIsDayFirstFromMatchesSingleFormat() { + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("05/15/2018 
17:14:56"); + timestampFormatFinder.addSample("05/25/2018 18:14:56"); + + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("15/05/2018 17:14:56"); + timestampFormatFinder.addSample("25/05/2018 18:14:56"); + + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("05/06/2018 17:14:56"); + timestampFormatFinder.addSample("05/07/2018 18:14:56"); + + // Second number has 3 values, first only 1, so guess second is day + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("06/05/2018 17:14:56"); + timestampFormatFinder.addSample("07/05/2018 18:14:56"); + + // First number has 3 values, second only 1, so guess first is day + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("06/06/2018 17:14:56"); + timestampFormatFinder.addSample("07/07/2018 18:14:56"); + + // Insufficient evidence to decide + 
assertNull(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + assertNull(timestampFormatFinder.guessIsDayFirstFromMatches(null)); + } + + public void testGuessIsDayFirstFromMatchesMultipleFormats() { + + // Similar to the test above, but with the possibility that the secondary + // ISO8601 formats cause confusion - this test proves that they don't + + TimestampFormatFinder.TimestampFormat expectedPrimaryFormat = + new TimestampFormatFinder.TimestampFormat(Collections.singletonList("??/??/yyyy HH:mm:ss"), + Pattern.compile("\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b"), "DATESTAMP", Collections.emptyMap(), ""); + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("2018-05-15T17:14:56"); + timestampFormatFinder.addSample("05/15/2018 17:14:56"); + timestampFormatFinder.addSample("2018-05-25T18:14:56"); + timestampFormatFinder.addSample("05/25/2018 18:14:56"); + + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("2018-05-15T17:14:56"); + timestampFormatFinder.addSample("15/05/2018 17:14:56"); + timestampFormatFinder.addSample("2018-05-25T18:14:56"); + timestampFormatFinder.addSample("25/05/2018 18:14:56"); + + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + 
timestampFormatFinder.addSample("2018-05-06T17:14:56"); + timestampFormatFinder.addSample("05/06/2018 17:14:56"); + timestampFormatFinder.addSample("2018-05-07T18:14:56"); + timestampFormatFinder.addSample("05/07/2018 18:14:56"); + + // Second number has 3 values, first only 1, so guess second is day + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + assertFalse(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("2018-05-06T17:14:56"); + timestampFormatFinder.addSample("06/05/2018 17:14:56"); + timestampFormatFinder.addSample("2018-05-07T18:14:56"); + timestampFormatFinder.addSample("07/05/2018 18:14:56"); + + // First number has 3 values, second only 1, so guess first is day + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + + timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, false, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.addSample("05/05/2018 16:14:56"); + timestampFormatFinder.addSample("2018-06-06T17:14:56"); + timestampFormatFinder.addSample("06/06/2018 17:14:56"); + timestampFormatFinder.addSample("2018-07-07T18:14:56"); + timestampFormatFinder.addSample("07/07/2018 18:14:56"); + + // Insufficient evidence to decide + assertNull(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + assertNull(timestampFormatFinder.guessIsDayFirstFromMatches(expectedPrimaryFormat)); + } + + public void testGuessIsDayFirstFromLocale() { + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + + // Locale fallback is the only way to decide + 
assertFalse(timestampFormatFinder.guessIsDayFirstFromLocale(Locale.US)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromLocale(Locale.UK)); + assertTrue(timestampFormatFinder.guessIsDayFirstFromLocale(Locale.FRANCE)); + assertFalse(timestampFormatFinder.guessIsDayFirstFromLocale(Locale.JAPAN)); + } + + public void testStringToNumberPosBitSet() { + + BitSet bitSet = TimestampFormatFinder.stringToNumberPosBitSet(""); + assertTrue(bitSet.isEmpty()); + assertEquals(0, bitSet.length()); + + bitSet = TimestampFormatFinder.stringToNumberPosBitSet(" 1"); + assertEquals(2, bitSet.length()); + assertFalse(bitSet.get(0)); + assertTrue(bitSet.get(1)); + + bitSet = TimestampFormatFinder.stringToNumberPosBitSet("1 1 1"); + assertEquals(5, bitSet.length()); + assertTrue(bitSet.get(0)); + assertFalse(bitSet.get(1)); + assertTrue(bitSet.get(2)); + assertFalse(bitSet.get(3)); + assertTrue(bitSet.get(4)); + + bitSet = TimestampFormatFinder.stringToNumberPosBitSet("05/05/2018 16:14:56"); + assertEquals(19, bitSet.length()); + assertTrue(bitSet.get(0)); + assertTrue(bitSet.get(1)); + assertFalse(bitSet.get(2)); + assertTrue(bitSet.get(3)); + assertTrue(bitSet.get(4)); + assertFalse(bitSet.get(5)); + assertTrue(bitSet.get(6)); + assertTrue(bitSet.get(7)); + assertTrue(bitSet.get(8)); + assertTrue(bitSet.get(9)); + assertFalse(bitSet.get(10)); + assertTrue(bitSet.get(11)); + assertTrue(bitSet.get(12)); + assertFalse(bitSet.get(13)); + assertTrue(bitSet.get(14)); + assertTrue(bitSet.get(15)); + assertFalse(bitSet.get(16)); + assertTrue(bitSet.get(17)); + assertTrue(bitSet.get(18)); + } + + public void testFindBitPattern() { + + BitSet findIn = TimestampFormatFinder.stringToNumberPosBitSet(""); + BitSet toFind = TimestampFormatFinder.stringToNumberPosBitSet(""); + assertEquals(0, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet(""); + assertEquals(0, 
TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(1, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet("1"); + assertEquals(0, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet(" 1"); + assertEquals(1, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(1, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(3, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet("1 1"); + assertEquals(0, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + assertEquals(-1, TimestampFormatFinder.findBitPattern(findIn, 3, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 11 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet("11 1"); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(2, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + assertEquals(-1, TimestampFormatFinder.findBitPattern(findIn, 3, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 11 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet(" 11 1"); + assertEquals(1, 
TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(1, TimestampFormatFinder.findBitPattern(findIn, 1, toFind)); + assertEquals(-1, TimestampFormatFinder.findBitPattern(findIn, 2, toFind)); + + findIn = TimestampFormatFinder.stringToNumberPosBitSet("1 11 1 1"); + toFind = TimestampFormatFinder.stringToNumberPosBitSet(" 1 1"); + assertEquals(4, TimestampFormatFinder.findBitPattern(findIn, 0, toFind)); + assertEquals(4, TimestampFormatFinder.findBitPattern(findIn, 4, toFind)); + assertEquals(-1, TimestampFormatFinder.findBitPattern(findIn, 5, toFind)); + } + + public void testFindBoundsForCandidate() { + + final TimestampFormatFinder.CandidateTimestampFormat httpdCandidateFormat = TimestampFormatFinder.ORDERED_CANDIDATE_FORMATS + .stream().filter(candidate -> candidate.outputGrokPatternName.equals("HTTPDATE")).findAny().get(); + + BitSet numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet("[2018-05-11T17:07:29,553][INFO ]" + + "[o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"); + assertEquals(new Tuple<>(1, 36), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet)); + assertEquals(new Tuple<>(-1, -1), TimestampFormatFinder.findBoundsForCandidate(httpdCandidateFormat, numberPosBitSet)); + // TAI64N doesn't necessarily contain digits, so this functionality cannot guarantee that it won't match somewhere in the text + assertEquals(new Tuple<>(0, Integer.MAX_VALUE), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet)); + + numberPosBitSet = TimestampFormatFinder.stringToNumberPosBitSet("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + + "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"); + assertEquals(new Tuple<>(-1, -1), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.ISO8601_CANDIDATE_FORMAT, numberPosBitSet)); + 
assertEquals(new Tuple<>(20, 46), TimestampFormatFinder.findBoundsForCandidate(httpdCandidateFormat, numberPosBitSet)); + assertEquals(new Tuple<>(0, Integer.MAX_VALUE), + TimestampFormatFinder.findBoundsForCandidate(TimestampFormatFinder.TAI64N_CANDIDATE_FORMAT, numberPosBitSet)); + } + + public void testFindFormatGivenNoMatch() { + + validateNoTimestampMatch(""); + validateNoTimestampMatch("no timestamps in here"); + validateNoTimestampMatch(":::"); + validateNoTimestampMatch("/+"); + } + + public void testFindFormatGivenOnlyIso8601() { + + validateTimestampMatch("2018-05-15T16:14:56,374Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896374L); + validateTimestampMatch("2018-05-15T17:14:56,374+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896374L); + validateTimestampMatch("2018-05-15T17:14:56,374+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896374L); + validateTimestampMatch("2018-05-15T17:14:56,374", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896374L); + + validateTimestampMatch("2018-05-15T16:14:56Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896000L); + validateTimestampMatch("2018-05-15T17:14:56+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896000L); + validateTimestampMatch("2018-05-15T17:14:56+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896000L); + validateTimestampMatch("2018-05-15T17:14:56", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400896000L); + + validateTimestampMatch("2018-05-15T16:14Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400840000L); + validateTimestampMatch("2018-05-15T17:14+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + 
"ISO8601", 1526400840000L); + validateTimestampMatch("2018-05-15T17:14+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400840000L); + validateTimestampMatch("2018-05-15T17:14", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "ISO8601", 1526400840000L); + + // TIMESTAMP_ISO8601 doesn't match ISO8601 if it's only a date with no time + validateTimestampMatch("2018-05-15", "CUSTOM_TIMESTAMP", "\\b\\d{4}-\\d{2}-\\d{2}\\b", "ISO8601", 1526338800000L); + + validateTimestampMatch("2018-05-15 16:14:56,374Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXX", 1526400896374L); + validateTimestampMatch("2018-05-15 17:14:56,374+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXX", 1526400896374L); + validateTimestampMatch("2018-05-15 17:14:56,374+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSSXXX", 1526400896374L); + validateTimestampMatch("2018-05-15 17:14:56,374", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss,SSS", 1526400896374L); + + validateTimestampMatch("2018-05-15 16:14:56Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXX", 1526400896000L); + validateTimestampMatch("2018-05-15 17:14:56+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXX", 1526400896000L); + validateTimestampMatch("2018-05-15 17:14:56+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ssXXX", 1526400896000L); + validateTimestampMatch("2018-05-15 17:14:56", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm:ss", 1526400896000L); + + validateTimestampMatch("2018-05-15 16:14Z", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXX", 1526400840000L); + 
validateTimestampMatch("2018-05-15 17:14+0100", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXX", 1526400840000L); + validateTimestampMatch("2018-05-15 17:14+01:00", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mmXXX", 1526400840000L); + validateTimestampMatch("2018-05-15 17:14", "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", + "yyyy-MM-dd HH:mm", 1526400840000L); + } + + public void testFindFormatGivenOnlyKnownTimestampFormat() { // Note: some of the time formats give millisecond accuracy, some second accuracy and some minute accuracy - validateTimestampMatch(new TimestampMatch(0, "", "YYYY-MM-dd HH:mm:ss,SSS Z", "yyyy-MM-dd HH:mm:ss,SSS XX", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2},\\d{3}", "TOMCAT_DATESTAMP", ""), "2018-05-15 17:14:56,374 +0100", - 1526400896374L); - - validateTimestampMatch(new TimestampMatch(11, "", "EEE MMM dd YYYY HH:mm:ss zzz", "EEE MMM dd yyyy HH:mm:ss zzz", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""), - "Tue May 15 2018 16:14:56 UTC", 1526400896000L); - validateTimestampMatch(new TimestampMatch(12, "", "EEE MMM dd YYYY HH:mm zzz", "EEE MMM dd yyyy HH:mm zzz", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC822", ""), - "Tue May 15 2018 16:14 UTC", 1526400840000L); - - validateTimestampMatch(new TimestampMatch(13, "", "EEE, dd MMM YYYY HH:mm:ss ZZ", "EEE, dd MMM yyyy HH:mm:ss XXX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), - "Tue, 15 May 2018 17:14:56 +01:00", 1526400896000L); - validateTimestampMatch(new TimestampMatch(14, "", "EEE, dd MMM YYYY HH:mm:ss Z", "EEE, dd MMM yyyy HH:mm:ss XX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), - "Tue, 15 May 2018 17:14:56 +0100", 1526400896000L); - validateTimestampMatch(new TimestampMatch(15, "", 
"EEE, dd MMM YYYY HH:mm ZZ", "EEE, dd MMM yyyy HH:mm XXX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), - "Tue, 15 May 2018 17:14 +01:00", 1526400840000L); - validateTimestampMatch(new TimestampMatch(16, "", "EEE, dd MMM YYYY HH:mm Z", "EEE, dd MMM yyyy HH:mm XX", - "\\b[A-Z]\\S{2,8}, \\d{1,2} [A-Z]\\S{2,8} \\d{4} \\d{2}:\\d{2} ", "DATESTAMP_RFC2822", ""), "Tue, 15 May 2018 17:14 +0100", - 1526400840000L); - - validateTimestampMatch(new TimestampMatch(17, "", "EEE MMM dd HH:mm:ss zzz YYYY", "EEE MMM dd HH:mm:ss zzz yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""), - "Tue May 15 16:14:56 UTC 2018", 1526400896000L); - validateTimestampMatch(new TimestampMatch(18, "", "EEE MMM dd HH:mm zzz YYYY", "EEE MMM dd HH:mm zzz yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2} [A-Z]{3,4} \\d{4}\\b", "DATESTAMP_OTHER", ""), - "Tue May 15 16:14 UTC 2018", 1526400840000L); - - validateTimestampMatch(new TimestampMatch(19, "", "YYYYMMddHHmmss", "yyyyMMddHHmmss", "\\b\\d{14}\\b", - "DATESTAMP_EVENTLOG", ""), - "20180515171456", 1526400896000L); - - validateTimestampMatch(new TimestampMatch(20, "", "EEE MMM dd HH:mm:ss YYYY", "EEE MMM dd HH:mm:ss yyyy", - "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "HTTPDERROR_DATE", ""), - "Tue May 15 17:14:56 2018", 1526400896000L); - - validateTimestampMatch(new TimestampMatch(21, "", Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), - Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56.725", 1526400896725L); - validateTimestampMatch(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", ""), "May 15 17:14:56", 
1526400896000L); - - validateTimestampMatch(new TimestampMatch(23, "", "dd/MMM/YYYY:HH:mm:ss Z", "dd/MMM/yyyy:HH:mm:ss XX", - "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", ""), "15/May/2018:17:14:56 +0100", 1526400896000L); - - validateTimestampMatch(new TimestampMatch(24, "", "MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", - "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", ""), "May 15, 2018 5:14:56 PM", - 1526400896000L); - - validateTimestampMatch(new TimestampMatch(25, "", Arrays.asList("MMM dd YYYY HH:mm:ss", "MMM d YYYY HH:mm:ss"), - Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "CISCOTIMESTAMP", ""), "May 15 2018 17:14:56", - 1526400896000L); - } - - public void testFindFirstMatchGivenOnlySystemDate() { - - assertEquals(new TimestampMatch(26, "", "UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), - TimestampFormatFinder.findFirstMatch("1526400896374", NOOP_TIMEOUT_CHECKER)); - assertEquals(new TimestampMatch(26, "", "UNIX_MS", "UNIX_MS", "\\b\\d{13}\\b", "POSINT", ""), - TimestampFormatFinder.findFirstFullMatch("1526400896374", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(27, "", "UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), - TimestampFormatFinder.findFirstMatch("1526400896.736", NOOP_TIMEOUT_CHECKER)); - assertEquals(new TimestampMatch(27, "", "UNIX", "UNIX", "\\b\\d{10}\\.\\d{3,9}\\b", "NUMBER", ""), - TimestampFormatFinder.findFirstFullMatch("1526400896.736", NOOP_TIMEOUT_CHECKER)); - assertEquals(new TimestampMatch(28, "", "UNIX", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), - TimestampFormatFinder.findFirstMatch("1526400896", NOOP_TIMEOUT_CHECKER)); - assertEquals(new TimestampMatch(28, "", "UNIX", "UNIX", "\\b\\d{10}\\b", "POSINT", ""), - TimestampFormatFinder.findFirstFullMatch("1526400896", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(29, "", "TAI64N", "TAI64N", 
"\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), - TimestampFormatFinder.findFirstMatch("400000005afb159a164ac980", NOOP_TIMEOUT_CHECKER)); - assertEquals(new TimestampMatch(29, "", "TAI64N", "TAI64N", "\\b[0-9A-Fa-f]{24}\\b", "BASE16NUM", ""), - TimestampFormatFinder.findFirstFullMatch("400000005afb159a164ac980", NOOP_TIMEOUT_CHECKER)); - } - - public void testFindFirstMatchGivenRealLogMessages() { - - assertEquals(new TimestampMatch(9, "[", "ISO8601", "yyyy-MM-dd'T'HH:mm:ss,SSS", - "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2},\\d{3}", "TIMESTAMP_ISO8601", - "][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [3.9gb], compressed ordinary object pointers [true]"), - TimestampFormatFinder.findFirstMatch("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " + - "heap size [3.9gb], compressed ordinary object pointers [true]", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(23, "192.168.62.101 - - [", "dd/MMM/YYYY:HH:mm:ss Z", "dd/MMM/yyyy:HH:mm:ss XX", - "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "HTTPDATE", - "] \"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384"), - TimestampFormatFinder.findFirstMatch("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + - "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(24, "", "MMM dd, YYYY h:mm:ss a", "MMM dd, yyyy h:mm:ss a", - "\\b[A-Z]\\S{2,8} \\d{1,2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "CATALINA_DATESTAMP", - " org.apache.tomcat.util.http.Parameters processParameters"), - TimestampFormatFinder.findFirstMatch("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters", - NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", " esxi1.acme.com 
Vpxa: " + - "[3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed"), - TimestampFormatFinder.findFirstMatch("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + - "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(10, "559550912540598297\t", "ISO8601", "ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}", - "TIMESTAMP_ISO8601", - "\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp"), - TimestampFormatFinder.findFirstMatch("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + - "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(22, "", Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), - "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", "SYSLOGTIMESTAMP", - " dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53"), - TimestampFormatFinder.findFirstMatch("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + - "'www.elastic.co/A/IN': 95.110.68.206#53", NOOP_TIMEOUT_CHECKER)); - - assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "TIMESTAMP_ISO8601", - "|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " + - "'User1'(id:2) in channel '3er Instanz'(id:2)"), - TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", - NOOP_TIMEOUT_CHECKER)); + validateTimestampMatch("2018-05-15 
17:14:56,374 +0100", "TOMCAT_DATESTAMP", + "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}[:.,]\\d{3}", "yyyy-MM-dd HH:mm:ss,SSS XX", 1526400896374L); + + validateTimestampMatch("Tue May 15 18 16:14:56 UTC", "DATESTAMP_RFC822", + "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{1,2} \\d{2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("EEE MMM dd yy HH:mm:ss zzz", "EEE MMM d yy HH:mm:ss zzz"), 1526400896000L); + + validateTimestampMatch("Tue, 15 May 2018 17:14:56 +01:00", "DATESTAMP_RFC2822", + "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "EEE, dd MMM yyyy HH:mm:ss XXX", 1526400896000L); + validateTimestampMatch("Tue, 15 May 2018 17:14:56 +0100", "DATESTAMP_RFC2822", + "\\b[A-Z]\\S{2}, \\d{1,2} [A-Z]\\S{2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", "EEE, dd MMM yyyy HH:mm:ss XX", 1526400896000L); + + validateTimestampMatch("Tue May 15 16:14:56 UTC 2018", "DATESTAMP_OTHER", + "\\b[A-Z]\\S{2,8} [A-Z]\\S{2,8} \\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("EEE MMM dd HH:mm:ss zzz yyyy", "EEE MMM d HH:mm:ss zzz yyyy"), 1526400896000L); + + validateTimestampMatch("20180515171456", "DATESTAMP_EVENTLOG", "\\b\\d{14}\\b", "yyyyMMddHHmmss", 1526400896000L); + + validateTimestampMatch("Tue May 15 17:14:56 2018", "HTTPDERROR_DATE", + "\\b[A-Z]\\S{2} [A-Z]\\S{2} \\d{2} \\d{2}:\\d{2}:\\d{2} \\d{4}\\b", "EEE MMM dd HH:mm:ss yyyy", 1526400896000L); + + validateTimestampMatch("May 15 17:14:56.725", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss.SSS", "MMM d HH:mm:ss.SSS"), 1526400896725L); + validateTimestampMatch("May 15 17:14:56", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss"), 1526400896000L); + + validateTimestampMatch("15/May/2018:17:14:56 +0100", "HTTPDATE", "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", + "dd/MMM/yyyy:HH:mm:ss XX", 1526400896000L); + + validateTimestampMatch("May 15, 2018 5:14:56 PM", "CATALINA_DATESTAMP", + 
"\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a", 1526400896000L); + + validateTimestampMatch("May 15 2018 17:14:56", "CISCOTIMESTAMP", "\\b[A-Z]\\S{2} {1,2}\\d{1,2} \\d{4} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd yyyy HH:mm:ss", "MMM d yyyy HH:mm:ss"), 1526400896000L); + + validateTimestampMatch("05/15/2018 17:14:56,374", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss,SSS", 1526400896374L); + validateTimestampMatch("05-15-2018-17:14:56.374", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy-HH:mm:ss.SSS", 1526400896374L); + validateTimestampMatch("15/05/2018 17:14:56.374", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd/MM/yyyy HH:mm:ss.SSS", 1526400896374L); + validateTimestampMatch("15-05-2018-17:14:56,374", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd-MM-yyyy-HH:mm:ss,SSS", 1526400896374L); + validateTimestampMatch("15.05.2018 17:14:56.374", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd.MM.yyyy HH:mm:ss.SSS", 1526400896374L); + validateTimestampMatch("05/15/2018 17:14:56", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM/dd/yyyy HH:mm:ss", 1526400896000L); + validateTimestampMatch("05-15-2018-17:14:56", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy-HH:mm:ss", 1526400896000L); + validateTimestampMatch("15/05/2018 17:14:56", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd/MM/yyyy HH:mm:ss", 1526400896000L); + validateTimestampMatch("15-05-2018-17:14:56", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "dd-MM-yyyy-HH:mm:ss", 1526400896000L); + validateTimestampMatch("15.05.2018 17:14:56", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- 
]\\d{2}:\\d{2}:\\d{2}\\b", "dd.MM.yyyy HH:mm:ss", 1526400896000L); + + validateTimestampMatch("05/15/2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "MM/dd/yyyy", 1526338800000L); + validateTimestampMatch("05-15-2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "MM-dd-yyyy", 1526338800000L); + validateTimestampMatch("15/05/2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "dd/MM/yyyy", 1526338800000L); + validateTimestampMatch("15-05-2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "dd-MM-yyyy", 1526338800000L); + validateTimestampMatch("15.05.2018", "DATE", "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}\\b", "dd.MM.yyyy", 1526338800000L); + } + + public void testFindFormatGivenOnlySystemDate() { + + validateTimestampMatch("1526400896374", "POSINT", "\\b\\d{13}\\b", "UNIX_MS", 1526400896374L); + + validateTimestampMatch("1526400896.736", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896736L); + validateTimestampMatch("1526400896", "NUMBER", "\\b\\d{10}\\b", "UNIX", 1526400896000L); + + validateTimestampMatch("400000005afb078a164ac980", "BASE16NUM", "\\b[0-9A-Fa-f]{24}\\b", "TAI64N", 1526400896374L); + } + + public void testCustomOverrideMatchingBuiltInFormat() { + + String overrideFormat = "yyyy-MM-dd HH:mm:ss,SSS"; + String text = "2018-05-15 17:14:56,374"; + String expectedSimpleRegex = "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}"; + String expectedGrokPatternName = "TIMESTAMP_ISO8601"; + + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, true, true, true, + NOOP_TIMEOUT_CHECKER); + strictTimestampFormatFinder.addSample(text); + assertEquals(expectedGrokPatternName, strictTimestampFormatFinder.getGrokPatternName()); + assertEquals(Collections.emptyMap(), strictTimestampFormatFinder.getCustomGrokPatternDefinitions()); + assertEquals(expectedSimpleRegex, strictTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(Collections.singletonList(overrideFormat), 
strictTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); + + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, false, false, false, + NOOP_TIMEOUT_CHECKER); + lenientTimestampFormatFinder.addSample(text); + lenientTimestampFormatFinder.selectBestMatch(); + assertEquals(expectedGrokPatternName, lenientTimestampFormatFinder.getGrokPatternName()); + assertEquals(Collections.emptyMap(), lenientTimestampFormatFinder.getCustomGrokPatternDefinitions()); + assertEquals(expectedSimpleRegex, lenientTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(Collections.singletonList(overrideFormat), lenientTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, lenientTimestampFormatFinder.getNumMatchedFormats()); + } + + public void testCustomOverrideNotMatchingBuiltInFormat() { + + String overrideFormat = "MM/dd HH.mm.ss,SSSSSS 'in' yyyy"; + String text = "05/15 17.14.56,374946 in 2018"; + String expectedSimpleRegex = "\\b\\d{2}/\\d{2} \\d{2}\\.\\d{2}\\.\\d{2},\\d{6} in \\d{4}\\b"; + String expectedGrokPatternName = "CUSTOM_TIMESTAMP"; + Map expectedCustomGrokPatternDefinitions = + Collections.singletonMap(TimestampFormatFinder.CUSTOM_TIMESTAMP_GROK_NAME, + "%{MONTHNUM2}/%{MONTHDAY} %{HOUR}\\.%{MINUTE}\\.%{SECOND} in %{YEAR}"); + + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, true, true, true, + NOOP_TIMEOUT_CHECKER); + strictTimestampFormatFinder.addSample(text); + assertEquals(expectedGrokPatternName, strictTimestampFormatFinder.getGrokPatternName()); + assertEquals(expectedCustomGrokPatternDefinitions, strictTimestampFormatFinder.getCustomGrokPatternDefinitions()); + assertEquals(expectedSimpleRegex, strictTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(Collections.singletonList(overrideFormat), 
strictTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); + + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, overrideFormat, false, false, false, + NOOP_TIMEOUT_CHECKER); + lenientTimestampFormatFinder.addSample(text); + lenientTimestampFormatFinder.selectBestMatch(); + assertEquals(expectedGrokPatternName, lenientTimestampFormatFinder.getGrokPatternName()); + assertEquals(expectedCustomGrokPatternDefinitions, lenientTimestampFormatFinder.getCustomGrokPatternDefinitions()); + assertEquals(expectedSimpleRegex, lenientTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(Collections.singletonList(overrideFormat), lenientTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, lenientTimestampFormatFinder.getNumMatchedFormats()); + } + + public void testFindFormatGivenRealLogMessages() { + + validateFindInFullMessage("[2018-05-11T17:07:29,553][INFO ][o.e.e.NodeEnvironment ] [node-0] " + + "heap size [3.9gb], compressed ordinary object pointers [true]", "[", "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "ISO8601"); + + validateFindInFullMessage("192.168.62.101 - - [29/Jun/2016:12:11:31 +0000] " + + "\"POST //apiserv:8080/engine/v2/jobs HTTP/1.1\" 201 42 \"-\" \"curl/7.46.0\" 384", "192.168.62.101 - - [", "HTTPDATE", + "\\b\\d{2}/[A-Z]\\S{2}/\\d{4}:\\d{2}:\\d{2}:\\d{2} ", "dd/MMM/yyyy:HH:mm:ss XX"); + + validateFindInFullMessage("Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters", "", + "CATALINA_DATESTAMP", "\\b[A-Z]\\S{2} \\d{2}, \\d{4} \\d{1,2}:\\d{2}:\\d{2} [AP]M\\b", "MMM dd, yyyy h:mm:ss a"); + + validateFindInFullMessage("Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' " + + "opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", "", "SYSLOGTIMESTAMP", + "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", Arrays.asList("MMM dd 
HH:mm:ss", "MMM d HH:mm:ss")); + + validateFindInFullMessage("559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t" + + "192.168.114.28\tAuthpriv\tInfo\tsshd\tsubsystem request for sftp", "559550912540598297\t", "TIMESTAMP_ISO8601", + "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "ISO8601"); + + validateFindInFullMessage("Sep 8 11:55:35 dnsserv named[22529]: error (unexpected RCODE REFUSED) resolving " + + "'www.elastic.co/A/IN': 95.110.68.206#53", "", "SYSLOGTIMESTAMP", "\\b[A-Z]\\S{2,8} {1,2}\\d{1,2} \\d{2}:\\d{2}:\\d{2}\\b", + Arrays.asList("MMM dd HH:mm:ss", "MMM d HH:mm:ss")); + + validateFindInFullMessage("10-28-2016 16:22:47.636 +0200 ERROR Network - " + + "Error encountered for connection from src=192.168.0.1:12345. Local side shutting down", "", "DATESTAMP", + "\\b\\d{1,2}[/.-]\\d{1,2}[/.-]\\d{4}[- ]\\d{2}:\\d{2}:\\d{2}\\b", "MM-dd-yyyy HH:mm:ss.SSS"); + + validateFindInFullMessage("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", "", + "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "yyyy-MM-dd HH:mm:ss.SSSSSS"); // Differs from the above as the required format is specified - assertEquals(new TimestampMatch(3, "", "YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS", - "\\b\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}", "TIMESTAMP_ISO8601", - "|INFO |VirtualServer |1 |client 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client " + - "'User1'(id:2) in channel '3er Instanz'(id:2)"), - TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", - randomFrom("YYYY-MM-dd HH:mm:ss.SSSSSS", "yyyy-MM-dd HH:mm:ss.SSSSSS"), NOOP_TIMEOUT_CHECKER)); + validateFindInFullMessage("yyyy-MM-dd 
HH:mm:ss.SSSSSS", "2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", "", + "TIMESTAMP_ISO8601", "\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", "yyyy-MM-dd HH:mm:ss.SSSSSS"); // Non-matching required format specified - assertNull(TimestampFormatFinder.findFirstMatch("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + - " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)", - randomFrom("UNIX", "EEE MMM dd YYYY HH:mm zzz"), NOOP_TIMEOUT_CHECKER)); - } - - public void testAdjustRequiredFormat() { - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSSSSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS Z", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSSSSS Z")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSS")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSS")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss,SSSSSSSSS")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSS")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd 
HH:mm:ss.SSSSSS")); - assertEquals("YYYY-MM-dd HH:mm:ss,SSS", TimestampFormatFinder.adjustRequiredFormat("YYYY-MM-dd HH:mm:ss.SSSSSSSSS")); - } - - public void testInterpretFractionalSeconds() { - assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("Sep 8 11:55:35")); - assertEquals(new Tuple<>(',', 0), TimestampFormatFinder.interpretFractionalSeconds("29/Jun/2016:12:11:31 +0000")); - assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368")); - assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438")); - assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764")); - assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764")); - assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368Z")); - assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438Z")); - assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764Z")); - assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764Z")); - assertEquals(new Tuple<>('.', 6), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06 17:21:25.764368 Z")); - assertEquals(new Tuple<>(',', 9), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764363438 Z")); - assertEquals(new Tuple<>(',', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25,764 Z")); - assertEquals(new Tuple<>('.', 3), TimestampFormatFinder.interpretFractionalSeconds("2018-01-06T17:21:25.764 Z")); - } - - private void validateTimestampMatch(TimestampMatch expected, String text, long expectedEpochMs) { - - assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, 
NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, expected.candidateIndex, NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, expected.candidateIndex, NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstMatch(text, Integer.MAX_VALUE, NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstFullMatch(text, Integer.MAX_VALUE, NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, randomFrom(expected.jodaTimestampFormats), NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, randomFrom(expected.jodaTimestampFormats), - NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstMatch(text, randomFrom(expected.javaTimestampFormats), NOOP_TIMEOUT_CHECKER)); - assertEquals(expected, TimestampFormatFinder.findFirstFullMatch(text, randomFrom(expected.javaTimestampFormats), - NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstMatch(text, "wrong format", NOOP_TIMEOUT_CHECKER)); - assertNull(TimestampFormatFinder.findFirstFullMatch(text, "wrong format", NOOP_TIMEOUT_CHECKER)); - - validateJodaTimestampFormats(expected.jodaTimestampFormats, text, expectedEpochMs); - validateJavaTimestampFormats(expected.javaTimestampFormats, text, expectedEpochMs); - - assertTrue(expected.simplePattern.matcher(text).find()); - } - - // This is because parsing timestamps using Joda formats generates warnings. - // Eventually we'll probably just remove the checks that the Joda formats - // are valid, and at that point this method can be removed too. - protected boolean enableWarningsCheck() { - return false; - } - - // This method is using the Joda BWC layer. When that's removed, this method - // can be deleted - we'll just validate the Java time formats after that. 
- // Also remove enableWarningsCheck() above if this method is removed. - private void validateJodaTimestampFormats(List jodaTimestampFormats, String text, long expectedEpochMs) { + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, + randomFrom("UNIX", "EEE MMM dd yyyy HH:mm zzz"), false, false, false, NOOP_TIMEOUT_CHECKER); + timestampFormatFinder.addSample("2018-01-06 19:22:20.106822|INFO |VirtualServer |1 |client " + + " 'User1'(id:2) was added to channelgroup 'Channel Admin'(id:5) by client 'User1'(id:2) in channel '3er Instanz'(id:2)"); + assertEquals(Collections.emptyList(), timestampFormatFinder.getJavaTimestampFormats()); + assertEquals(0, timestampFormatFinder.getNumMatchedFormats()); + } - // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London. - // This is the timezone that will be used for any text representations that don't include it. - ZoneId defaultZone = ZoneId.of("Europe/London"); - long actualEpochMs; - for (int i = 0; i < jodaTimestampFormats.size(); ++i) { - try { - String timestampFormat = jodaTimestampFormats.get(i); - switch (timestampFormat) { - case "ISO8601": - actualEpochMs = Joda.forPattern("date_optional_time").withZone(defaultZone).parseMillis(text); - break; - default: - actualEpochMs = Joda.forPattern(timestampFormat).withYear(2018).withZone(defaultZone).parseMillis(text); - break; - } - if (expectedEpochMs == actualEpochMs) { - break; - } - // If the last one isn't right then propagate - if (i == jodaTimestampFormats.size() - 1) { - assertEquals(expectedEpochMs, actualEpochMs); - } - } catch (RuntimeException e) { - // If the last one throws then propagate - if (i == jodaTimestampFormats.size() - 1) { - throw e; - } - } + + public void testSelectBestMatchGivenAllSame() { + String sample = "[2018-06-27T11:59:22,125][INFO ][o.e.n.Node ] [node-0] initializing ...\n" + + "[2018-06-27T11:59:22,201][INFO ][o.e.e.NodeEnvironment ] [node-0] using [1] data paths, mounts [[/ 
(/dev/disk1)]], " + + "net usable_space [216.1gb], net total_space [464.7gb], types [hfs]\n" + + "[2018-06-27T11:59:22,202][INFO ][o.e.e.NodeEnvironment ] [node-0] heap size [494.9mb], " + + "compressed ordinary object pointers [true]\n" + + "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] node name [node-0], node ID [Ha1gD8nNSDqjd6PIyu3DJA]\n" + + "[2018-06-27T11:59:22,204][INFO ][o.e.n.Node ] [node-0] version[6.4.0-SNAPSHOT], pid[2785], " + + "build[default/zip/3c60efa/2018-06-26T14:55:15.206676Z], OS[Mac OS X/10.12.6/x86_64], " + + "JVM[\"Oracle Corporation\"/Java HotSpot(TM) 64-Bit Server VM/10/10+46]\n" + + "[2018-06-27T11:59:22,205][INFO ][o.e.n.Node ] [node-0] JVM arguments [-Xms1g, -Xmx1g, " + + "-XX:+UseConcMarkSweepGC, -XX:CMSInitiatingOccupancyFraction=75, -XX:+UseCMSInitiatingOccupancyOnly, " + + "-XX:+AlwaysPreTouch, -Xss1m, -Djava.awt.headless=true, -Dfile.encoding=UTF-8, -Djna.nosys=true, " + + "-XX:-OmitStackTraceInFastThrow, -Dio.netty.noUnsafe=true, -Dio.netty.noKeySetOptimization=true, " + + "-Dio.netty.recycler.maxCapacityPerThread=0, -Dlog4j.shutdownHookEnabled=false, -Dlog4j2.disable.jmx=true, " + + "-Djava.io.tmpdir=/var/folders/k5/5sqcdlps5sg3cvlp783gcz740000h0/T/elasticsearch.nFUyeMH1, " + + "-XX:+HeapDumpOnOutOfMemoryError, -XX:HeapDumpPath=data, -XX:ErrorFile=logs/hs_err_pid%p.log, " + + "-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m, " + + "-Djava.locale.providers=COMPAT, -Dio.netty.allocator.type=unpooled, -ea, -esa, -Xms512m, -Xmx512m, " + + "-Des.path.home=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT, " + + "-Des.path.conf=/Users/dave/elasticsearch/distribution/build/cluster/run node0/elasticsearch-6.4.0-SNAPSHOT/config, " + + "-Des.distribution.flavor=default, -Des.distribution.type=zip]\n" + + "[2018-06-27T11:59:22,205][WARN ][o.e.n.Node ] [node-0] version [6.4.0-SNAPSHOT] is a pre-release version of " + + "Elasticsearch and is not 
suitable for production\n" + + "[2018-06-27T11:59:23,585][INFO ][o.e.p.PluginsService ] [node-0] loaded module [aggs-matrix-stats]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [analysis-common]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [ingest-common]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-expression]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-mustache]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [lang-painless]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [mapper-extras]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [parent-join]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [percolator]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [rank-eval]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [reindex]\n" + + "[2018-06-27T11:59:23,586][INFO ][o.e.p.PluginsService ] [node-0] loaded module [repository-url]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [transport-netty4]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-core]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-deprecation]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-graph]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-logstash]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-ml]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-monitoring]\n" + + 
"[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-rollup]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-security]\n" + + "[2018-06-27T11:59:23,587][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-sql]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-upgrade]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] loaded module [x-pack-watcher]\n" + + "[2018-06-27T11:59:23,588][INFO ][o.e.p.PluginsService ] [node-0] no plugins loaded\n"; + + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, + sample.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + timestampFormatFinder.selectBestMatch(); + assertEquals(Collections.singletonList("ISO8601"), timestampFormatFinder.getJavaTimestampFormats()); + assertEquals("TIMESTAMP_ISO8601", timestampFormatFinder.getGrokPatternName()); + assertEquals("\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", timestampFormatFinder.getSimplePattern().pattern()); + for (String preface : timestampFormatFinder.getPrefaces()) { + assertEquals("[", preface); + } + assertEquals(1, timestampFormatFinder.getNumMatchedFormats()); + } + + public void testSelectBestMatchGivenExceptionTrace() { + + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), FileStructureOverrides.EMPTY_OVERRIDES, NOOP_TIMEOUT_CHECKER); + + // Even though many lines have a timestamp near the end (in the Lucene version information), + // these are so far along the lines that the weight of the timestamp near the beginning of the + // first line should take precedence + timestampFormatFinder.selectBestMatch(); + assertEquals(Collections.singletonList("ISO8601"), timestampFormatFinder.getJavaTimestampFormats()); + 
assertEquals("TIMESTAMP_ISO8601", timestampFormatFinder.getGrokPatternName()); + assertEquals("\\b\\d{4}-\\d{2}-\\d{2}[T ]\\d{2}:\\d{2}", timestampFormatFinder.getSimplePattern().pattern()); + for (String preface : timestampFormatFinder.getPrefaces()) { + assertEquals("[", preface); } + assertEquals(2, timestampFormatFinder.getNumMatchedFormats()); + } + + public void testSelectBestMatchGivenExceptionTraceAndTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("yyyy-MM-dd HH:mm:ss").build(); + + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); + + // The override should force the seemingly inferior choice of timestamp + // TODO - this won't work any more :-( + } + + public void testSelectBestMatchGivenExceptionTraceAndImpossibleTimestampFormatOverride() { + + FileStructureOverrides overrides = FileStructureOverrides.builder().setTimestampFormat("MMM dd HH:mm:ss").build(); + + TimestampFormatFinder timestampFormatFinder = TextLogFileStructureFinder.populateTimestampFormatFinder(explanation, + EXCEPTION_TRACE_SAMPLE.split("\n"), overrides, NOOP_TIMEOUT_CHECKER); + + timestampFormatFinder.selectBestMatch(); + assertEquals(Collections.emptyList(), timestampFormatFinder.getJavaTimestampFormats()); + assertNull(timestampFormatFinder.getGrokPatternName()); + assertNull(timestampFormatFinder.getSimplePattern()); + assertEquals(Collections.emptyList(), timestampFormatFinder.getPrefaces()); + assertEquals(0, timestampFormatFinder.getNumMatchedFormats()); + } + + private void validateNoTimestampMatch(String text) { + + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + expectThrows(IllegalArgumentException.class, () -> strictTimestampFormatFinder.addSample(text)); + assertEquals(0, 
strictTimestampFormatFinder.getNumMatchedFormats()); + + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, false, false, false, + NOOP_TIMEOUT_CHECKER); + lenientTimestampFormatFinder.addSample(text); + lenientTimestampFormatFinder.selectBestMatch(); + assertNull(lenientTimestampFormatFinder.getGrokPatternName()); + assertEquals(0, lenientTimestampFormatFinder.getNumMatchedFormats()); + } + + private void validateTimestampMatch(String text, String expectedGrokPatternName, String expectedSimpleRegex, + String expectedJavaTimestampFormat, long expectedEpochMs) { + validateTimestampMatch(text, expectedGrokPatternName, expectedSimpleRegex, Collections.singletonList(expectedJavaTimestampFormat), + expectedEpochMs); + } + + private void validateTimestampMatch(String text, String expectedGrokPatternName, String expectedSimpleRegex, + List expectedJavaTimestampFormats, long expectedEpochMs) { + + Pattern expectedSimplePattern = Pattern.compile(expectedSimpleRegex); + assertTrue(expectedSimplePattern.matcher(text).find()); + validateJavaTimestampFormats(expectedJavaTimestampFormats, text, expectedEpochMs); + + TimestampFormatFinder strictTimestampFormatFinder = new TimestampFormatFinder(explanation, true, true, true, NOOP_TIMEOUT_CHECKER); + strictTimestampFormatFinder.addSample(text); + assertEquals(expectedGrokPatternName, strictTimestampFormatFinder.getGrokPatternName()); + assertEquals(expectedSimplePattern.pattern(), strictTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(expectedJavaTimestampFormats, strictTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, strictTimestampFormatFinder.getNumMatchedFormats()); + + TimestampFormatFinder lenientTimestampFormatFinder = new TimestampFormatFinder(explanation, false, false, false, + NOOP_TIMEOUT_CHECKER); + lenientTimestampFormatFinder.addSample(text); + lenientTimestampFormatFinder.selectBestMatch(); + assertEquals(expectedGrokPatternName, 
lenientTimestampFormatFinder.getGrokPatternName()); + assertEquals(expectedSimplePattern.pattern(), lenientTimestampFormatFinder.getSimplePattern().pattern()); + assertEquals(expectedJavaTimestampFormats, lenientTimestampFormatFinder.getJavaTimestampFormats()); + assertEquals(1, lenientTimestampFormatFinder.getNumMatchedFormats()); + } + + private void validateFindInFullMessage(String message, String expectedPreface, String expectedGrokPatternName, + String expectedSimpleRegex, String expectedJavaTimestampFormat) { + validateFindInFullMessage(message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, + Collections.singletonList(expectedJavaTimestampFormat)); + } + + private void validateFindInFullMessage(String timestampFormatOverride, String message, String expectedPreface, + String expectedGrokPatternName, String expectedSimpleRegex, + String expectedJavaTimestampFormat) { + validateFindInFullMessage(timestampFormatOverride, message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, + Collections.singletonList(expectedJavaTimestampFormat)); + } + + private void validateFindInFullMessage(String message, String expectedPreface, String expectedGrokPatternName, + String expectedSimpleRegex, List expectedJavaTimestampFormats) { + validateFindInFullMessage(null, message, expectedPreface, expectedGrokPatternName, expectedSimpleRegex, + expectedJavaTimestampFormats); + } + + private void validateFindInFullMessage(String timestampFormatOverride, String message, String expectedPreface, + String expectedGrokPatternName, String expectedSimpleRegex, + List expectedJavaTimestampFormats) { + + Pattern expectedSimplePattern = Pattern.compile(expectedSimpleRegex); + assertTrue(expectedSimplePattern.matcher(message).find()); + + TimestampFormatFinder timestampFormatFinder = new TimestampFormatFinder(explanation, timestampFormatOverride, false, false, false, + NOOP_TIMEOUT_CHECKER); + timestampFormatFinder.addSample(message); + 
timestampFormatFinder.selectBestMatch(); + assertEquals(expectedGrokPatternName, timestampFormatFinder.getGrokPatternName()); + assertEquals(expectedSimplePattern.pattern(), timestampFormatFinder.getSimplePattern().pattern()); + assertEquals(expectedJavaTimestampFormats, timestampFormatFinder.getJavaTimestampFormats()); + assertEquals(Collections.singletonList(expectedPreface), timestampFormatFinder.getPrefaces()); + assertEquals(1, timestampFormatFinder.getNumMatchedFormats()); } private void validateJavaTimestampFormats(List javaTimestampFormats, String text, long expectedEpochMs) { @@ -325,18 +998,35 @@ private void validateJavaTimestampFormats(List javaTimestampFormats, Str // All the test times are for Tue May 15 2018 16:14:56 UTC, which is 17:14:56 in London. // This is the timezone that will be used for any text representations that don't include it. ZoneId defaultZone = ZoneId.of("Europe/London"); - TemporalAccessor parsed; + long actualEpochMs; for (int i = 0; i < javaTimestampFormats.size(); ++i) { try { String timestampFormat = javaTimestampFormats.get(i); switch (timestampFormat) { case "ISO8601": - parsed = DateFormatter.forPattern("strict_date_optional_time_nanos").withZone(defaultZone).parse(text); + actualEpochMs = DateFormatter.forPattern("iso8601").withZone(defaultZone).parseMillis(text); + break; + case "UNIX_MS": + actualEpochMs = Long.parseLong(text); + break; + case "UNIX": + actualEpochMs = (long) (Double.parseDouble(text) * 1000.0); + break; + case "TAI64N": + actualEpochMs = parseMillisFromTai64n(text); break; default: - DateTimeFormatter parser = new DateTimeFormatterBuilder() - .appendPattern(timestampFormat).parseDefaulting(ChronoField.YEAR_OF_ERA, 2018) - .toFormatter(Locale.ROOT); + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder().appendPattern(timestampFormat); + if (timestampFormat.indexOf('y') == -1) { + builder.parseDefaulting(ChronoField.YEAR_OF_ERA, 2018); + } + if (timestampFormat.indexOf('m') == -1) { + // All 
formats tested have either both or neither of hour and minute + builder.parseDefaulting(ChronoField.HOUR_OF_DAY, 0); + builder.parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0); + // Seconds automatically defaults to 0 + } + DateTimeFormatter parser = builder.toFormatter(Locale.ROOT); // This next line parses the textual date without any default timezone, so if // the text doesn't contain the timezone then the resulting temporal accessor // will be incomplete (i.e. impossible to convert to an Instant). You would @@ -346,15 +1036,15 @@ private void validateJavaTimestampFormats(List javaTimestampFormats, Str // from the text. The solution is to parse twice, once without a default // timezone and then again with a default timezone if the first parse didn't // find one in the text. - parsed = parser.parse(text); + TemporalAccessor parsed = parser.parse(text); if (parsed.query(TemporalQueries.zone()) == null) { // TODO: when Java 8 is no longer supported remove the two // lines and comment above and the closing brace below parsed = parser.withZone(defaultZone).parse(text); } + actualEpochMs = Instant.from(parsed).toEpochMilli(); break; } - long actualEpochMs = Instant.from(parsed).toEpochMilli(); if (expectedEpochMs == actualEpochMs) { break; } @@ -370,4 +1060,17 @@ private void validateJavaTimestampFormats(List javaTimestampFormats, Str } } } + + /** + * Logic copied from {@code org.elasticsearch.ingest.common.DateFormat.Tai64n.parseMillis}. + */ + private long parseMillisFromTai64n(String tai64nDate) { + if (tai64nDate.startsWith("@")) { + tai64nDate = tai64nDate.substring(1); + } + assertEquals(24, tai64nDate.length()); + long seconds = Long.parseLong(tai64nDate.substring(1, 16), 16); + long nanos = Long.parseLong(tai64nDate.substring(16, 24), 16); + return (seconds * 1000) - 10000 + nanos / 1000000; + } }