Skip to content

Commit

Permalink
Extract capture config from grok patterns up front (backport of #62706)…
Browse files Browse the repository at this point in the history
… (#62785)

This extracts the configuration for extracting values from a groked
string when building the grok expression to do two things:
1. Create a method exposing that configuration on `Grok` itself which
   will be used by `grok` flavored runtime fields.
2. Marginally speed up extracting grok values by skipping a little
   string manipulation.
  • Loading branch information
nik9000 committed Sep 22, 2020
1 parent fa13585 commit 7ffea46
Show file tree
Hide file tree
Showing 5 changed files with 289 additions and 82 deletions.
37 changes: 23 additions & 14 deletions libs/grok/src/main/java/org/elasticsearch/grok/Grok.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
import java.util.Map;
import java.util.function.Consumer;

import static java.util.Collections.unmodifiableList;

public final class Grok {
/**
* Patterns built in to the grok library.
Expand Down Expand Up @@ -72,6 +74,7 @@ public final class Grok {
private final boolean namedCaptures;
private final Regex compiledExpression;
private final MatcherWatchdog matcherWatchdog;
private final List<GrokCaptureConfig> captureConfig;

public Grok(Map<String, String> patternBank, String grokPattern, Consumer<String> logCallBack) {
this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
Expand Down Expand Up @@ -101,6 +104,12 @@ private Grok(Map<String, String> patternBank, String grokPattern, boolean namedC
byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
this.compiledExpression = new Regex(expressionBytes, 0, expressionBytes.length, Option.DEFAULT, UTF8Encoding.INSTANCE,
message -> logCallBack.accept(message));

List<GrokCaptureConfig> captureConfig = new ArrayList<>();
for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
captureConfig.add(new GrokCaptureConfig(entry.next()));
}
this.captureConfig = unmodifiableList(captureConfig);
}

/**
Expand Down Expand Up @@ -146,7 +155,7 @@ private void forbidCircularReferences(String patternName, List<String> path, Str
}
}

public String groupMatch(String name, Region region, String pattern) {
private String groupMatch(String name, Region region, String pattern) {
try {
int number = GROK_PATTERN_REGEX.nameToBackrefNumber(name.getBytes(StandardCharsets.UTF_8), 0,
name.getBytes(StandardCharsets.UTF_8).length, region);
Expand All @@ -165,7 +174,7 @@ public String groupMatch(String name, Region region, String pattern) {
*
* @return named regex expression
*/
public String toRegex(String grokPattern) {
protected String toRegex(String grokPattern) {
StringBuilder res = new StringBuilder();
for (int i = 0; i < MAX_TO_REGEX_ITERATIONS; i++) {
byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8);
Expand Down Expand Up @@ -255,19 +264,12 @@ public Map<String, Object> captures(String text) {
// TODO: I think we should throw an error here?
return null;
} else if (compiledExpression.numberOfNames() > 0) {
Map<String, Object> fields = new HashMap<>();
Map<String, Object> fields = new HashMap<>(captureConfig.size());
Region region = matcher.getEagerRegion();
for (Iterator<NameEntry> entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
NameEntry e = entry.next();
String groupName = new String(e.name, e.nameP, e.nameEnd - e.nameP, StandardCharsets.UTF_8);
for (int number : e.getBackRefs()) {
if (region.beg[number] >= 0) {
String matchValue = new String(textAsBytes, region.beg[number], region.end[number] - region.beg[number],
StandardCharsets.UTF_8);
GrokMatchGroup match = new GrokMatchGroup(groupName, matchValue);
fields.put(match.getName(), match.getValue());
break;
}
for (GrokCaptureConfig config: captureConfig) {
Object v = config.extract(textAsBytes, region);
if (v != null) {
fields.put(config.name(), v);
}
}
return fields;
Expand All @@ -276,6 +278,13 @@ public Map<String, Object> captures(String text) {
}
}

/**
* The list of values that this {@linkplain Grok} can capture.
* <p>
* The list is populated in the constructor with one {@link GrokCaptureConfig}
* per entry of the compiled expression's named back reference iterator, and
* is wrapped with {@code unmodifiableList} before being stored.
*
* @return the unmodifiable list of capture configurations
*/
public List<GrokCaptureConfig> captureConfig() {
return captureConfig;
}

/**
* Load built-in patterns.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.grok;

import org.joni.NameEntry;
import org.joni.Region;

import java.nio.charset.StandardCharsets;

/**
 * Describes one named value that a {@link Grok} expression is able to capture:
 * the field name, the declared {@link GrokCaptureType}, and the back reference
 * numbers of the regex groups that may hold the value.
 */
public final class GrokCaptureConfig {
    private final String name;
    private final GrokCaptureType type;
    private final int[] backRefs;

    /**
     * Builds the configuration from a named group of the compiled expression.
     * <p>
     * The group name is decoded as UTF-8 and split on {@code :}. The second
     * segment is used as the field name when there are at least two segments,
     * otherwise the first segment is used. The third segment selects the type
     * only when there are exactly three segments; in every other case the
     * type is {@link GrokCaptureType#STRING}.
     *
     * @param nameEntry the named group to describe
     */
    GrokCaptureConfig(NameEntry nameEntry) {
        int nameLength = nameEntry.nameEnd - nameEntry.nameP;
        String[] segments = new String(nameEntry.name, nameEntry.nameP, nameLength, StandardCharsets.UTF_8).split(":");
        if (segments.length >= 2) {
            this.name = segments[1];
        } else {
            this.name = segments[0];
        }
        if (segments.length == 3) {
            this.type = GrokCaptureType.fromString(segments[2]);
        } else {
            this.type = GrokCaptureType.STRING;
        }
        this.backRefs = nameEntry.getBackRefs();
    }

    /**
     * The name defined for the field in the pattern.
     *
     * @return the field name
     */
    public String name() {
        return this.name;
    }

    /**
     * The type defined for the field in the pattern.
     *
     * @return the capture type
     */
    public GrokCaptureType type() {
        return this.type;
    }

    /**
     * Reads this capture's value out of a match.
     * <p>
     * The back references are checked in order; the first one whose start
     * offset in {@code region} is non-negative supplies the bytes, which are
     * decoded as UTF-8 and handed to the capture type for parsing.
     *
     * @param textAsBytes the bytes of the text that was matched
     * @param region the region holding the start and end offsets of each group
     * @return the parsed value, or {@code null} if no back reference has a
     *         non-negative start offset
     */
    Object extract(byte[] textAsBytes, Region region) {
        for (int i = 0; i < backRefs.length; i++) {
            int group = backRefs[i];
            int start = region.beg[group];
            if (start < 0) {
                continue;
            }
            int length = region.end[group] - start;
            return type.parse(new String(textAsBytes, start, length, StandardCharsets.UTF_8));
        }
        return null;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.grok;

/**
 * The type declared for a captured field in a grok pattern. Each constant
 * converts the matched text into a value of the corresponding Java type.
 */
public enum GrokCaptureType {
    /** Leaves the matched text untouched. */
    STRING {
        @Override
        protected Object parseValue(String str) {
            return str;
        }
    },
    /** Converts the matched text to an {@link Integer}. */
    INTEGER {
        @Override
        protected Object parseValue(String str) {
            return Integer.valueOf(str);
        }
    },
    /** Converts the matched text to a {@link Long}. */
    LONG {
        @Override
        protected Object parseValue(String str) {
            return Long.valueOf(str);
        }
    },
    /** Converts the matched text to a {@link Double}. */
    DOUBLE {
        @Override
        protected Object parseValue(String str) {
            return Double.valueOf(str);
        }
    },
    /** Converts the matched text to a {@link Float}. */
    FLOAT {
        @Override
        protected Object parseValue(String str) {
            return Float.valueOf(str);
        }
    },
    /** Converts the matched text to a {@link Boolean}. */
    BOOLEAN {
        @Override
        protected Object parseValue(String str) {
            return Boolean.valueOf(str);
        }
    };

    /**
     * Null-tolerant entry point for parsing.
     *
     * @param str the matched text, possibly {@code null}
     * @return {@code null} when {@code str} is {@code null}, otherwise the
     *         result of {@link #parseValue(String)}
     */
    final Object parse(String str) {
        return str == null ? null : parseValue(str);
    }

    /**
     * Converts non-null matched text into this type's value.
     *
     * @param str the matched text, never {@code null} when reached through {@link #parse(String)}
     * @return the converted value
     */
    protected abstract Object parseValue(String str);

    /**
     * Looks up the type for the name used in a grok pattern.
     *
     * @param str the type name from the pattern; must not be {@code null}
     * @return the matching type, or {@link #STRING} for {@code "string"} and
     *         for any name that is not recognized
     */
    static GrokCaptureType fromString(String str) {
        if (str.equals("int")) {
            return INTEGER;
        }
        if (str.equals("long")) {
            return LONG;
        }
        if (str.equals("double")) {
            return DOUBLE;
        }
        if (str.equals("float")) {
            return FLOAT;
        }
        if (str.equals("boolean")) {
            return BOOLEAN;
        }
        // "string" and every unrecognized name fall back to STRING.
        return STRING;
    }
}
68 changes: 0 additions & 68 deletions libs/grok/src/main/java/org/elasticsearch/grok/GrokMatchGroup.java

This file was deleted.

0 comments on commit 7ffea46

Please sign in to comment.