Skip to content

Commit

Permalink
New GrokPatternBank data structure (#95269)
Browse files Browse the repository at this point in the history
This refactor introduces a new data structure called `PatternBank`, an abstraction over the old `Map<String, String>` that was used all over the place. This data structure has handy methods to extend a pattern bank with new patterns, and it also centralizes the validation of pattern banks in one place. Thanks to this, the code that creates Grok pattern banks is no longer duplicated.
---------

Co-authored-by: Joe Gallo <joe.gallo@elastic.co>
  • Loading branch information
HiDAl and joegallo committed Apr 20, 2023
1 parent e560b81 commit b51951f
Show file tree
Hide file tree
Showing 12 changed files with 331 additions and 258 deletions.
88 changes: 6 additions & 82 deletions libs/grok/src/main/java/org/elasticsearch/grok/Grok.java
Original file line number Diff line number Diff line change
Expand Up @@ -51,38 +51,34 @@ public final class Grok {

private static final int MAX_TO_REGEX_ITERATIONS = 100_000; // sanity limit

private final Map<String, String> patternBank;
private final boolean namedCaptures;
private final Regex compiledExpression;
private final MatcherWatchdog matcherWatchdog;
private final List<GrokCaptureConfig> captureConfig;

public Grok(Map<String, String> patternBank, String grokPattern, Consumer<String> logCallBack) {
public Grok(PatternBank patternBank, String grokPattern, Consumer<String> logCallBack) {
this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
}

public Grok(Map<String, String> patternBank, String grokPattern, MatcherWatchdog matcherWatchdog, Consumer<String> logCallBack) {
public Grok(PatternBank patternBank, String grokPattern, MatcherWatchdog matcherWatchdog, Consumer<String> logCallBack) {
this(patternBank, grokPattern, true, matcherWatchdog, logCallBack);
}

Grok(Map<String, String> patternBank, String grokPattern, boolean namedCaptures, Consumer<String> logCallBack) {
Grok(PatternBank patternBank, String grokPattern, boolean namedCaptures, Consumer<String> logCallBack) {
this(patternBank, grokPattern, namedCaptures, MatcherWatchdog.noop(), logCallBack);
}

private Grok(
Map<String, String> patternBank,
PatternBank patternBank,
String grokPattern,
boolean namedCaptures,
MatcherWatchdog matcherWatchdog,
Consumer<String> logCallBack
) {
this.patternBank = patternBank;
this.namedCaptures = namedCaptures;
this.matcherWatchdog = matcherWatchdog;

forbidCircularReferences();

String expression = toRegex(grokPattern);
String expression = toRegex(patternBank, grokPattern);
byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
this.compiledExpression = new Regex(
expressionBytes,
Expand All @@ -100,78 +96,6 @@ private Grok(
this.captureConfig = List.copyOf(grokCaptureConfigs);
}

/**
* Checks whether patterns reference each other in a circular manner and if so fail with an exception
*
* In a pattern, anything between <code>%{</code> and <code>}</code> or <code>:</code> is considered
* a reference to another named pattern. This method will navigate to all these named patterns and
* check for a circular reference.
*/
private void forbidCircularReferences() {

// first ensure that the pattern bank contains no simple circular references (i.e., any pattern
// containing an immediate reference to itself) as those can cause the remainder of this algorithm
// to recurse infinitely
for (Map.Entry<String, String> entry : patternBank.entrySet()) {
if (patternReferencesItself(entry.getValue(), entry.getKey())) {
throw new IllegalArgumentException("circular reference in pattern [" + entry.getKey() + "][" + entry.getValue() + "]");
}
}

// next, recursively check any other pattern names referenced in each pattern
for (Map.Entry<String, String> entry : patternBank.entrySet()) {
String name = entry.getKey();
String pattern = entry.getValue();
innerForbidCircularReferences(name, new ArrayList<>(), pattern);
}
}

private void innerForbidCircularReferences(String patternName, List<String> path, String pattern) {
if (patternReferencesItself(pattern, patternName)) {
String message;
if (path.isEmpty()) {
message = "circular reference in pattern [" + patternName + "][" + pattern + "]";
} else {
message = "circular reference in pattern ["
+ path.remove(path.size() - 1)
+ "]["
+ pattern
+ "] back to pattern ["
+ patternName
+ "]";
// add rest of the path:
if (path.isEmpty() == false) {
message += " via patterns [" + String.join("=>", path) + "]";
}
}
throw new IllegalArgumentException(message);
}

// next check any other pattern names found in the pattern
for (int i = pattern.indexOf("%{"); i != -1; i = pattern.indexOf("%{", i + 1)) {
int begin = i + 2;
int bracketIndex = pattern.indexOf('}', begin);
int columnIndex = pattern.indexOf(':', begin);
int end;
if (bracketIndex != -1 && columnIndex == -1) {
end = bracketIndex;
} else if (columnIndex != -1 && bracketIndex == -1) {
end = columnIndex;
} else if (bracketIndex != -1 && columnIndex != -1) {
end = Math.min(bracketIndex, columnIndex);
} else {
throw new IllegalArgumentException("pattern [" + pattern + "] has circular references to other pattern definitions");
}
String otherPatternName = pattern.substring(begin, end);
path.add(otherPatternName);
innerForbidCircularReferences(patternName, path, patternBank.get(otherPatternName));
}
}

private static boolean patternReferencesItself(String pattern, String patternName) {
return pattern.contains("%{" + patternName + "}") || pattern.contains("%{" + patternName + ":");
}

private String groupMatch(String name, Region region, String pattern) {
int number = GROK_PATTERN_REGEX.nameToBackrefNumber(
name.getBytes(StandardCharsets.UTF_8),
Expand All @@ -192,7 +116,7 @@ private String groupMatch(String name, Region region, String pattern) {
*
* @return named regex expression
*/
protected String toRegex(String grokPattern) {
protected String toRegex(PatternBank patternBank, String grokPattern) {
StringBuilder res = new StringBuilder();
for (int i = 0; i < MAX_TO_REGEX_ITERATIONS; i++) {
byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
Expand All @@ -27,21 +26,21 @@ public class GrokBuiltinPatterns {
/**
* Patterns built in to the grok library.
*/
private static Map<String, String> LEGACY_PATTERNS;
private static Map<String, String> ECS_V1_PATTERNS;
private static PatternBank LEGACY_PATTERNS;
private static PatternBank ECS_V1_PATTERNS;

public static synchronized Map<String, String> legacyPatterns() {
public static synchronized PatternBank legacyPatterns() {
return get(false);
}

public static synchronized Map<String, String> ecsV1Patterns() {
public static synchronized PatternBank ecsV1Patterns() {
return get(true);
}

/**
* Load built-in patterns.
*/
public static synchronized Map<String, String> get(boolean ecsCompatibility) {
public static synchronized PatternBank get(boolean ecsCompatibility) {
if (ecsCompatibility) {
if (ECS_V1_PATTERNS == null) {
ECS_V1_PATTERNS = loadEcsPatterns();
Expand All @@ -55,7 +54,7 @@ public static synchronized Map<String, String> get(boolean ecsCompatibility) {
}
}

public static Map<String, String> get(String ecsCompatibility) {
public static PatternBank get(String ecsCompatibility) {
if (isValidEcsCompatibilityMode(ecsCompatibility)) {
return get(ECS_COMPATIBILITY_V1.equals(ecsCompatibility));
} else {
Expand All @@ -67,7 +66,7 @@ public static boolean isValidEcsCompatibilityMode(String ecsCompatibility) {
return ECS_COMPATIBILITY_MODES.contains(ecsCompatibility);
}

private static Map<String, String> loadLegacyPatterns() {
private static PatternBank loadLegacyPatterns() {
var patternNames = List.of(
"aws",
"bacula",
Expand All @@ -94,7 +93,7 @@ private static Map<String, String> loadLegacyPatterns() {
return loadPatternsFromDirectory(patternNames, "/patterns/legacy/");
}

private static Map<String, String> loadEcsPatterns() {
private static PatternBank loadEcsPatterns() {
var patternNames = List.of(
"aws",
"bacula",
Expand Down Expand Up @@ -122,7 +121,7 @@ private static Map<String, String> loadEcsPatterns() {
return loadPatternsFromDirectory(patternNames, "/patterns/ecs-v1/");
}

private static Map<String, String> loadPatternsFromDirectory(List<String> patternNames, String directory) {
private static PatternBank loadPatternsFromDirectory(List<String> patternNames, String directory) {
Map<String, String> builtinPatterns = new LinkedHashMap<>();
for (String pattern : patternNames) {
try {
Expand All @@ -133,7 +132,7 @@ private static Map<String, String> loadPatternsFromDirectory(List<String> patter
throw new RuntimeException("failed to load built-in patterns", e);
}
}
return Collections.unmodifiableMap(builtinPatterns);
return new PatternBank(builtinPatterns);
}

private static void loadPatternsFromFile(Map<String, String> patternBank, InputStream inputStream) throws IOException {
Expand Down
136 changes: 136 additions & 0 deletions libs/grok/src/main/java/org/elasticsearch/grok/PatternBank.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.grok;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
 * A validated, unmodifiable collection of named Grok patterns.
 * <p>
 * Every bank is checked on construction: a pattern may not reference itself (directly or through
 * other patterns), and every pattern referenced via <code>%{NAME}</code> or <code>%{NAME:...}</code>
 * must exist in the bank. The bank keeps a private copy of the map it was built from, so later
 * changes to that map do not affect it.
 */
public class PatternBank {

    /** A pattern bank that contains no patterns. */
    public static final PatternBank EMPTY = new PatternBank(Map.of());

    private final Map<String, String> bank;

    /**
     * Creates a pattern bank from the given name-to-pattern mapping.
     *
     * @param bank the patterns, keyed by pattern name; must not be null
     * @throws NullPointerException if {@code bank} is null
     * @throws IllegalArgumentException if the patterns contain a circular reference, a reference to a
     *                                  pattern that is not in the bank, or an unterminated <code>%{</code>
     */
    public PatternBank(Map<String, String> bank) {
        Objects.requireNonNull(bank, "bank must not be null");

        // take the defensive copy first and validate the copy, so that what gets validated is exactly
        // what gets stored; LinkedHashMap maintains the iteration order of the passed-in bank
        // (assuming there was a meaningful order)
        Map<String, String> copy = new LinkedHashMap<>(bank);
        forbidCircularReferences(copy);
        this.bank = Collections.unmodifiableMap(copy);
    }

    /**
     * Looks up a pattern by name.
     *
     * @param patternName the name of the pattern
     * @return the pattern definition, or {@code null} if the bank has no pattern with that name
     */
    public String get(String patternName) {
        return bank.get(patternName);
    }

    /**
     * @return an unmodifiable view of all the patterns in this bank, keyed by pattern name
     */
    public Map<String, String> bank() {
        return bank;
    }

    /**
     * Extends a pattern bank with extra patterns, returning a new pattern bank.
     * <p>
     * The returned bank will be the same reference as the original pattern bank if the extra patterns map is null or empty.
     * Extra patterns replace patterns of the same name in this bank, and the combined bank is validated again.
     *
     * @param extraPatterns the patterns to extend this bank with (may be empty or null)
     * @return the extended pattern bank
     * @throws IllegalArgumentException if the combined patterns fail validation
     */
    public PatternBank extendWith(Map<String, String> extraPatterns) {
        if (extraPatterns == null || extraPatterns.isEmpty()) {
            return this;
        }

        var extendedBank = new LinkedHashMap<>(bank);
        extendedBank.putAll(extraPatterns);
        return new PatternBank(extendedBank);
    }

    /**
     * Checks whether patterns reference each other in a circular manner and if so fail with an exception.
     * <p>
     * In a pattern, anything between <code>%{</code> and <code>}</code> or <code>:</code> is considered
     * a reference to another named pattern. This method will navigate to all these named patterns and
     * check for a circular reference.
     *
     * @param bank the patterns to validate, keyed by pattern name
     * @throws IllegalArgumentException if a circular reference is found, if a pattern references a name
     *                                  that is not in the bank, or if a <code>%{</code> is never terminated
     */
    static void forbidCircularReferences(Map<String, String> bank) {
        // first ensure that the pattern bank contains no simple circular references (i.e., any pattern
        // containing an immediate reference to itself) so that they are reported with the short message
        for (Map.Entry<String, String> entry : bank.entrySet()) {
            if (patternReferencesItself(entry.getValue(), entry.getKey())) {
                throw new IllegalArgumentException("circular reference in pattern [" + entry.getKey() + "][" + entry.getValue() + "]");
            }
        }

        // next, for each pattern, walk everything it references (transitively) looking for a reference
        // back to it; each walk gets its own path and its own set of already-explored pattern names
        for (Map.Entry<String, String> entry : bank.entrySet()) {
            String name = entry.getKey();
            String pattern = entry.getValue();
            innerForbidCircularReferences(bank, name, new ArrayList<>(), new HashSet<>(), pattern);
        }
    }

    /**
     * Depth-first walk over the patterns referenced by {@code pattern}, failing if any of them refers
     * back to {@code patternName}.
     *
     * @param bank        all known patterns, keyed by pattern name
     * @param patternName the pattern the walk started from, i.e. the one a cycle would lead back to
     * @param path        the names followed from the start pattern down to {@code pattern}; it is empty
     *                    while the start pattern itself is being examined
     * @param visited     names already entered during this walk; they are not entered a second time
     * @param pattern     the definition currently being examined
     */
    private static void innerForbidCircularReferences(
        Map<String, String> bank,
        String patternName,
        List<String> path,
        Set<String> visited,
        String pattern
    ) {
        if (patternReferencesItself(pattern, patternName)) {
            String message;
            if (path.isEmpty()) {
                message = "circular reference in pattern [" + patternName + "][" + pattern + "]";
            } else {
                // the last path element is the name of the pattern whose definition closes the cycle
                message = "circular reference in pattern ["
                    + path.remove(path.size() - 1)
                    + "]["
                    + pattern
                    + "] back to pattern ["
                    + patternName
                    + "]";
                // add rest of the path:
                if (path.isEmpty() == false) {
                    message += " via patterns [" + String.join("=>", path) + "]";
                }
            }
            throw new IllegalArgumentException(message);
        }

        // next check any other pattern names found in the pattern
        for (int i = pattern.indexOf("%{"); i != -1; i = pattern.indexOf("%{", i + 1)) {
            int begin = i + 2;
            int bracketIndex = pattern.indexOf('}', begin);
            int columnIndex = pattern.indexOf(':', begin);
            int end;
            if (bracketIndex != -1 && columnIndex == -1) {
                end = bracketIndex;
            } else if (columnIndex != -1 && bracketIndex == -1) {
                end = columnIndex;
            } else if (bracketIndex != -1 && columnIndex != -1) {
                end = Math.min(bracketIndex, columnIndex);
            } else {
                throw new IllegalArgumentException("pattern [" + pattern + "] has an invalid syntax");
            }
            String otherPatternName = pattern.substring(begin, end);
            String otherPattern = bank.get(otherPatternName);
            if (otherPattern == null) {
                throw new IllegalArgumentException(
                    "pattern [" + patternName + "] is referencing a non-existent pattern [" + otherPatternName + "]"
                );
            }

            // A name that was already entered during this walk is either fully explored (nothing in
            // it leads back to the start pattern) or still being explored further up the path (a cycle
            // that does not include the start pattern). Entering it again cannot find anything new,
            // and in the second case it would recurse forever; that cycle is reported when the outer
            // loop starts a walk from one of its own members.
            if (visited.add(otherPatternName) == false) {
                continue;
            }

            path.add(otherPatternName);
            innerForbidCircularReferences(bank, patternName, path, visited, otherPattern);
            // backtrack, so that the path only ever holds the chain leading to the current pattern
            path.remove(path.size() - 1);
        }
    }

    /**
     * @return true if {@code pattern} contains <code>%{patternName}</code> or <code>%{patternName:</code>
     */
    private static boolean patternReferencesItself(String pattern, String patternName) {
        return pattern.contains("%{" + patternName + "}") || pattern.contains("%{" + patternName + ":");
    }
}

0 comments on commit b51951f

Please sign in to comment.