-
Notifications
You must be signed in to change notification settings - Fork 554
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use the antlr parser for parsing pg arrays.
Such as `PgArray#decodeUTF8Text` contained at least two known bugs related to parsing the `point` and `varchar` Postgres types, and was hard to extend and reason about, we decided to go with the Antlr parser to parse/decode pg arrays. This change fixes the above mentioned issues with insertion of the `varchar` and `point` type arrays and introduces a new approach to parse the pg arrays.
- Loading branch information
1 parent
3091e26
commit 4154a6f
Showing
16 changed files
with
650 additions
and
158 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,4 +55,4 @@ out/ | |
|
||
# Misc | ||
sandbox/crate/data/ | ||
sql-parser/src/main/java/io/crate/sql/parser/antlr/ | ||
**/antlr/v4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
apply from: "$rootDir/gradle/javaModule.gradle" | ||
|
||
archivesBaseName = 'crate-pgwire' | ||
|
||
ext.antlr = [ | ||
source : "src/main/antlr", | ||
output : "src/main/java/io/crate/protocols/postgres/antlr/v4", | ||
package: 'io.crate.protocols.postgres.antlr.v4' | ||
] | ||
|
||
configurations { | ||
antlr4 | ||
} | ||
|
||
dependencies { | ||
antlr4 "org.antlr:antlr4:${versions.antlr}" | ||
|
||
implementation "org.antlr:antlr4-runtime:${versions.antlr}" | ||
implementation "com.carrotsearch:hppc:${versions.carrotsearch_hppc}" | ||
|
||
testImplementation "org.hamcrest:hamcrest:${versions.hamcrest}" | ||
testImplementation "junit:junit:${versions.junit}" | ||
testImplementation "com.fasterxml.jackson.core:jackson-databind:${versions.jacksondatabind}" | ||
} | ||
|
||
task antlrOutputDir { | ||
doLast { | ||
mkdir(antlr.output) | ||
} | ||
} | ||
|
||
task generateGrammarSource(dependsOn: antlrOutputDir, type: JavaExec) { | ||
inputs.files(fileTree(antlr.source)) | ||
outputs.dir file(antlr.output) | ||
|
||
def grammars = fileTree(antlr.source).include('**/*.g4') | ||
|
||
main = 'org.antlr.v4.Tool' | ||
classpath = configurations.antlr4 | ||
args = [ | ||
"-o", "${antlr.output}", | ||
"-visitor", "-no-listener", | ||
"-package", antlr.package, | ||
grammars.files | ||
].flatten() | ||
} | ||
|
||
tasks.withType(JavaCompile) { | ||
it.dependsOn('generateGrammarSource') | ||
} | ||
|
||
clean { | ||
delete antlr.output | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
grammar PgArray; | ||
|
||
/** | ||
The grammar is used to parse PG array text | ||
representations. E.g.: | ||
numeric PG arrays: | ||
{10, NULL, NULL, 20, 30} | ||
{"10", NULL, NULL, "20", "30"} | ||
multi-dimentsional PG arrays: | ||
{{"10", "20"}, {"30", NULL, "40"}} | ||
json PG arrays: | ||
{"{\"x\": 10}", "{\"y\": 20}"} | ||
{\"{\\\"x\\\": 10}\", \"{\\\"y\\\": 20}\"} | ||
*/ | ||
|
||
array | ||
: '{' item (',' item)* '}' | ||
| '{' '}' | ||
; | ||
|
||
item | ||
: STRING #value | ||
| NUMBER #value | ||
| bool #value | ||
| array #noop | ||
| NULL #null | ||
; | ||
|
||
bool | ||
: 'true' | ||
| 'false' | ||
; | ||
|
||
STRING | ||
: '"' (ESC | ~["\\])* '"' | ||
; | ||
NUMBER | ||
: '-'? DIGIT '.' DIGIT EXP? | ||
| '-'? DIGIT EXP | ||
| '-'? DIGIT | ||
; | ||
NULL: 'NULL'; | ||
fragment ESC | ||
: '\\' (["\\/bfnrt] | UNICODE) | ||
; | ||
fragment UNICODE | ||
: 'u' HEX HEX HEX HEX | ||
; | ||
fragment HEX | ||
: [0-9a-fA-F] | ||
; | ||
fragment DIGIT | ||
: '0' | [1-9] [0-9]* | ||
; | ||
fragment EXP | ||
: [Ee] [+\-]? DIGIT | ||
; | ||
WS | ||
: [ \t\n\r] + -> skip | ||
; |
85 changes: 85 additions & 0 deletions
85
pgwire/src/main/java/io/crate/protocols/postgres/parser/PgArrayASTVisitor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/* | ||
* Licensed to Crate under one or more contributor license agreements. | ||
* See the NOTICE file distributed with this work for additional | ||
* information regarding copyright ownership. Crate licenses this file | ||
* to you under the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. You may | ||
* obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
* implied. See the License for the specific language governing | ||
* permissions and limitations under the License. | ||
* | ||
* However, if you have executed another commercial license agreement | ||
* with Crate these terms will supersede the license and you may use the | ||
* software solely pursuant to the terms of the relevant commercial | ||
* agreement. | ||
*/ | ||
|
||
package io.crate.protocols.postgres.parser; | ||
|
||
import com.carrotsearch.hppc.ByteArrayList; | ||
import io.crate.protocols.postgres.antlr.v4.PgArrayBaseVisitor; | ||
import io.crate.protocols.postgres.antlr.v4.PgArrayParser; | ||
|
||
import java.util.ArrayList; | ||
import java.util.function.Function; | ||
|
||
import static java.nio.charset.StandardCharsets.UTF_8; | ||
|
||
class PgArrayASTVisitor extends PgArrayBaseVisitor<Object> { | ||
|
||
private final Function<byte[], Object> convert; | ||
|
||
PgArrayASTVisitor(Function<byte[], Object> convert) { | ||
this.convert = convert; | ||
} | ||
|
||
@Override | ||
public Object visitArray(PgArrayParser.ArrayContext ctx) { | ||
ArrayList<Object> array = new ArrayList<>(); | ||
for (var value : ctx.item()) { | ||
array.add(visit(value)); | ||
} | ||
return array; | ||
} | ||
|
||
@Override | ||
public Object visitValue(PgArrayParser.ValueContext ctx) { | ||
return convert.apply(processArrayItem(ctx.getText().getBytes(UTF_8))); | ||
} | ||
|
||
@Override | ||
public Object visitNull(PgArrayParser.NullContext ctx) { | ||
return null; | ||
} | ||
|
||
/** | ||
* Processes an array's item by skipping escape characters | ||
* and unquoting the item if it is needed. | ||
* | ||
* @param bytes {@code byte[]} that represent an array's item. | ||
*/ | ||
private static byte[] processArrayItem(byte[] bytes) { | ||
var itemBytes = new ByteArrayList(); | ||
int start = 0, end = bytes.length - 1; | ||
if (bytes[start] == '"') { | ||
start += 1; | ||
end -= 1; | ||
} | ||
|
||
for (int i = start; i <= end; i++) { | ||
if (i < end && (char) bytes[i] == '\\' | ||
&& ((char) bytes[i + 1] == '\\' || (char) bytes[i + 1] == '\"')) { | ||
i++; | ||
} | ||
itemBytes.add(bytes[i]); | ||
} | ||
|
||
return itemBytes.toArray(); | ||
} | ||
} |
101 changes: 101 additions & 0 deletions
101
pgwire/src/main/java/io/crate/protocols/postgres/parser/PgArrayParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
/* | ||
* Licensed to Crate under one or more contributor license agreements. | ||
* See the NOTICE file distributed with this work for additional | ||
* information regarding copyright ownership. Crate licenses this file | ||
* to you under the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. You may | ||
* obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
* implied. See the License for the specific language governing | ||
* permissions and limitations under the License. | ||
* | ||
* However, if you have executed another commercial license agreement | ||
* with Crate these terms will supersede the license and you may use the | ||
* software solely pursuant to the terms of the relevant commercial | ||
* agreement. | ||
*/ | ||
|
||
package io.crate.protocols.postgres.parser; | ||
|
||
import io.crate.protocols.postgres.antlr.v4.PgArrayLexer; | ||
import org.antlr.v4.runtime.BaseErrorListener; | ||
import org.antlr.v4.runtime.CharStreams; | ||
import org.antlr.v4.runtime.CommonTokenStream; | ||
import org.antlr.v4.runtime.ParserRuleContext; | ||
import org.antlr.v4.runtime.RecognitionException; | ||
import org.antlr.v4.runtime.Recognizer; | ||
import org.antlr.v4.runtime.atn.PredictionMode; | ||
import org.antlr.v4.runtime.misc.ParseCancellationException; | ||
|
||
import java.io.ByteArrayInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.function.Function; | ||
|
||
public class PgArrayParser { | ||
|
||
private static final BaseErrorListener ERROR_LISTENER = new BaseErrorListener() { | ||
@Override | ||
public void syntaxError(Recognizer<?, ?> recognizer, | ||
Object offendingSymbol, | ||
int line, | ||
int charPositionInLine, | ||
String message, | ||
RecognitionException e) { | ||
throw new PgArrayParsingException(message, e, line, charPositionInLine); | ||
} | ||
}; | ||
|
||
private static final PgArrayParser INSTANCE = new PgArrayParser(); | ||
|
||
public static Object parse(byte[] bytes, Function<byte[], Object> convert) { | ||
return INSTANCE.invokeParser( | ||
new ByteArrayInputStream(bytes), | ||
io.crate.protocols.postgres.antlr.v4.PgArrayParser::array, | ||
convert); | ||
} | ||
|
||
private Object invokeParser(InputStream inputStream, | ||
Function<io.crate.protocols.postgres.antlr.v4.PgArrayParser, ParserRuleContext> parseFunction, | ||
Function<byte[], Object> convert) { | ||
try { | ||
var lexer = new PgArrayLexer(CharStreams.fromStream( | ||
inputStream, | ||
StandardCharsets.UTF_8)); | ||
var tokenStream = new CommonTokenStream(lexer); | ||
var parser = new io.crate.protocols.postgres.antlr.v4.PgArrayParser(tokenStream); | ||
|
||
lexer.removeErrorListeners(); | ||
lexer.addErrorListener(ERROR_LISTENER); | ||
|
||
parser.removeErrorListeners(); | ||
parser.addErrorListener(ERROR_LISTENER); | ||
|
||
ParserRuleContext tree; | ||
try { | ||
// first, try parsing with potentially faster SLL mode | ||
parser.getInterpreter().setPredictionMode(PredictionMode.SLL); | ||
tree = parseFunction.apply(parser); | ||
} catch (ParseCancellationException ex) { | ||
// if we fail, parse with LL mode | ||
tokenStream.seek(0); // rewind input stream | ||
parser.reset(); | ||
|
||
parser.getInterpreter().setPredictionMode(PredictionMode.LL); | ||
tree = parseFunction.apply(parser); | ||
} | ||
|
||
return new PgArrayASTVisitor(convert).visit(tree); | ||
} catch (StackOverflowError e) { | ||
throw new PgArrayParsingException("stack overflow while parsing: " + e.getLocalizedMessage()); | ||
} catch (IOException e) { | ||
return new IllegalArgumentException(e); | ||
} | ||
} | ||
} |
Oops, something went wrong.