Skip to content

Commit

Permalink
Add support for raw string literals; tweak lexer/ast.
Browse files Browse the repository at this point in the history
1. Add raw strings.

* `$$ ... $$` strings are now raw strings.

* New syntax: `r'...'` and `r"..."` -- raw string literals.

Raw strings do not process any escape sequences, meaning that the `\`
symbol is treated just like any other character.

2. Tweak string literals.

Regular strings no longer accept \xhh codes over 0x7F.  This is
modeled after Rust strings [1], quoting:

"Higher values are not permitted because it is ambiguous whether
they mean Unicode code points or byte values."

3. New AST nodes.

The EdgeQL and IR implementations each got two new nodes: `StringConstant`
and `RawStringConstant`.  SQL also has a new node: `EscapedStringConstant`
(which compiles to an `E'...'` string).

[1] https://doc.rust-lang.org/reference/tokens.html#character-escapes
  • Loading branch information
1st1 committed Oct 25, 2018
1 parent e793fd3 commit 399cb02
Show file tree
Hide file tree
Showing 21 changed files with 310 additions and 69 deletions.
20 changes: 20 additions & 0 deletions edb/lang/edgeql/ast.py
Expand Up @@ -24,6 +24,7 @@
from edb.lang.common import ast, parsing

from . import functypes as ft
from . import quote


# Operators
Expand Down Expand Up @@ -235,6 +236,25 @@ class Constant(Expr):
value: typing.Union[int, str, float, bool, bytes, decimal.Decimal]


class StringConstant(Constant):
    """Constant node for a regular (escape-processing) string literal.

    ``quote`` holds the delimiter the literal is written with; ``value``
    (inherited from ``Constant``) holds the text between the delimiters.
    """

    quote: str

    @classmethod
    def from_pystr(cls, s: str):
        """Build a single-quoted node from the Python string *s*.

        Every backslash in *s* is doubled first, then the text is passed
        through ``quote.quote_literal``.  The first and last characters of
        that result are dropped (presumably the delimiters it adds --
        confirm against ``quote.quote_literal``).
        """
        escaped = s.replace('\\', '\\\\')
        literal = quote.quote_literal(escaped)
        body = literal[1:-1]
        return cls(value=body, quote="'")


class RawStringConstant(Constant):
    """Constant node for a raw string literal.

    ``quote`` holds the delimiter the literal is written with; ``value``
    (inherited from ``Constant``) holds the text between the delimiters.
    """

    quote: str

    @classmethod
    def from_pystr(cls, s: str):
        """Build a single-quoted node from the Python string *s*.

        Unlike ``StringConstant.from_pystr`` no backslash doubling is
        applied; *s* goes straight through ``quote.quote_literal`` and the
        first and last characters of the result are dropped.
        """
        # NOTE(review): how quote_literal treats an embedded "'" is not
        # visible here; confirm such input round-trips as a raw literal.
        literal = quote.quote_literal(s)
        body = literal[1:-1]
        return cls(value=body, quote="'")


class Parameter(Expr):
name: str

Expand Down
9 changes: 9 additions & 0 deletions edb/lang/edgeql/codegen.py
Expand Up @@ -505,6 +505,15 @@ def visit_ShapeElement(self, node):
def visit_Parameter(self, node):
    """Write the textual form of the parameter named ``node.name``."""
    rendered = param_to_str(node.name)
    self.write(rendered)

def visit_StringConstant(self, node):
    """Write a regular string literal: delimiter, value, delimiter.

    The value is emitted exactly as stored on the node; no escaping is
    performed here.
    """
    delimiter = node.quote
    self.write(delimiter, node.value, delimiter)

def visit_RawStringConstant(self, node):
    """Write a raw string literal.

    A delimiter that starts with ``$`` is written as-is on both sides of
    the value; any other delimiter is preceded by an ``r`` prefix.
    """
    delimiter = node.quote
    prefix = () if delimiter.startswith('$') else ('r',)
    self.write(*prefix, delimiter, node.value, delimiter)

def visit_Constant(self, node):
if isinstance(node.value, str):
self.write(edgeql_quote.quote_literal(node.value))
Expand Down
6 changes: 6 additions & 0 deletions edb/lang/edgeql/compiler/decompiler.py
Expand Up @@ -175,6 +175,12 @@ def visit_Parameter(self, node):
def visit_Constant(self, node):
    """Return a ``qlast.Constant`` carrying the same value as *node*."""
    ir_value = node.value
    return qlast.Constant(value=ir_value)

def visit_StringConstant(self, node):
    """Return a ``qlast.StringConstant`` built from ``node.value``.

    Construction is delegated to ``qlast.StringConstant.from_pystr``.
    """
    text = node.value
    return qlast.StringConstant.from_pystr(text)

def visit_RawStringConstant(self, node):
    """Return a ``qlast.RawStringConstant`` built from ``node.value``.

    Construction is delegated to ``qlast.RawStringConstant.from_pystr``.
    """
    text = node.value
    return qlast.RawStringConstant.from_pystr(text)

def visit_Array(self, node):
return qlast.Array(elements=[
self.visit(e) for e in node.elements
Expand Down
13 changes: 11 additions & 2 deletions edb/lang/edgeql/compiler/expr.py
Expand Up @@ -161,11 +161,20 @@ def compile_Set(
def compile_Constant(
expr: qlast.Base, *, ctx: context.ContextLevel) -> irast.Base:

node_cls = irast.Constant

if expr.value is None:
ct = None
else:
if isinstance(expr.value, str):
if isinstance(expr, qlast.StringConstant):
std_type = 'std::str'
node_cls = irast.StringConstant
elif isinstance(expr, qlast.RawStringConstant):
std_type = 'std::str'
node_cls = irast.RawStringConstant
elif isinstance(expr.value, str):
std_type = 'std::str'
node_cls = irast.StringConstant
elif isinstance(expr.value, decimal.Decimal):
std_type = 'std::decimal'
elif isinstance(expr.value, float):
Expand All @@ -187,7 +196,7 @@ def compile_Constant(
ct = ctx.schema.get(std_type)

return setgen.generated_set(
irast.Constant(value=expr.value, type=ct), ctx=ctx)
node_cls(value=expr.value, type=ct), ctx=ctx)


@dispatch.compile.register(qlast.EmptyCollection)
Expand Down
32 changes: 24 additions & 8 deletions edb/lang/edgeql/parser/grammar/ddl.py
Expand Up @@ -29,7 +29,7 @@

from ...errors import EdgeQLSyntaxError

from .expressions import Nonterm, BaseStringConstant
from .expressions import Nonterm
from . import tokens

from .precedence import * # NOQA
Expand Down Expand Up @@ -400,9 +400,11 @@ class OptDeltaTarget(Nonterm):
def reduce_empty(self):
    # Presumably the empty alternative of this nonterminal (judging by the
    # method name -- confirm against the parser framework): the resulting
    # value is None.
    self.val = None

def reduce_TO_AnyIdentifier_SCONST(self, *kids):
def reduce_TO_AnyIdentifier_BaseStringConstant(self, *kids):
self.val = [kids[1], kids[2]]

def reduce_TO_AnyIdentifier_BaseRawStringConstant(self, *kids):
    # kids[0] is not used; the value is the pair of the second and third
    # children, in that order.
    identifier = kids[1]
    literal = kids[2]
    self.val = [identifier, literal]

#
# DELTAS
Expand All @@ -411,15 +413,17 @@ def reduce_TO_AnyIdentifier_SCONST(self, *kids):
#
# CREATE MIGRATION
#


class CreateDeltaStmt(Nonterm):
def _parse_schema_decl(self, tok: tokens.T_SCONST):
def _parse_schema_decl(self, tok):
from edb.lang.common.exceptions import get_context
from edb.lang.schema import parser

ctx = tok.context

try:
node = parser.parse(BaseStringConstant.parse_body(tok))
node = parser.parse(tok.val.value)
except parsing.ParserError as err:
context.rebase_context(
ctx, get_context(err, parsing.ParserContext))
Expand Down Expand Up @@ -1638,14 +1642,26 @@ def _parse_language(node):


class FromFunction(Nonterm):
def reduce_FROM_Identifier_SCONST(self, *kids):
def reduce_FROM_Identifier_BaseStringConstant(self, *kids):
lang = _parse_language(kids[1])
code = kids[2].val.value
self.val = qlast.FunctionCode(language=lang, code=code)

# we need literal value of the string
code = BaseStringConstant.parse_body(kids[2])

def reduce_FROM_Identifier_BaseRawStringConstant(self, *kids):
    # kids[1] names the language (resolved by _parse_language); kids[2] is
    # the raw string constant whose node value becomes the function code.
    self.val = qlast.FunctionCode(
        language=_parse_language(kids[1]),
        code=kids[2].val.value,
    )

def reduce_FROM_Identifier_FUNCTION_BaseRawStringConstant(self, *kids):
    # kids[1] names the language; kids[3] is the raw string constant that
    # holds the name of the function to bind to.
    language_kid = kids[1]
    lang = _parse_language(language_kid)

    # Only SQL is accepted in this clause; anything else is a syntax error
    # reported at the language identifier.
    if lang != qlast.Language.SQL:
        raise EdgeQLSyntaxError(
            f'{lang} language is not supported in FROM FUNCTION clause',
            context=language_kid.context) from None

    function_name = kids[3].val.value
    self.val = qlast.FunctionCode(language=lang, from_name=function_name)

def reduce_FROM_Identifier_FUNCTION_BaseStringConstant(self, *kids):
lang = _parse_language(kids[1])
if lang != qlast.Language.SQL:
Expand Down
91 changes: 67 additions & 24 deletions edb/lang/edgeql/parser/grammar/expressions.py
Expand Up @@ -30,7 +30,7 @@

from ...errors import EdgeQLSyntaxError

from . import keywords, precedence, tokens
from . import keywords, precedence, tokens, lexer

from .precedence import * # NOQA
from .tokens import * # NOQA
Expand Down Expand Up @@ -823,6 +823,7 @@ class Constant(Nonterm):
# ArgConstant
# | BaseNumberConstant
# | BaseStringConstant
# | BaseRawStringConstant
# | BaseBooleanConstant
# | BaseBytesConstant

Expand All @@ -832,6 +833,9 @@ def reduce_ArgConstant(self, *kids):
def reduce_BaseNumberConstant(self, *kids):
    # Pass the first child's value through unchanged.
    number = kids[0]
    self.val = number.val

def reduce_BaseRawStringConstant(self, *kids):
    # Pass the first child's value through unchanged.
    raw_string = kids[0]
    self.val = raw_string.val

def reduce_BaseStringConstant(self, *kids):
    # Pass the first child's value through unchanged.
    string = kids[0]
    self.val = string.val

Expand Down Expand Up @@ -863,22 +867,27 @@ class BaseStringConstant(Nonterm):
valid_str_re = re.compile(r'''
^
(?P<Q>
(
' | " | \$([A-Za-z\200-\377_][0-9]*)*\$
)
' | "
)
(?P<body>
(?:
\\\n | # line continuation
\n | # new line
\\\\ | # \\
\\['"] | # \' or \"
\\x[0-9a-fA-F]{2} | # \x00 -- hex code
\\u[0-9a-fA-F]{4} | # \u0000
\\U[0-9a-fA-F]{8} | # \U00000000
\\ (t | n | r) | # \t, \n, or \r
#
\\x[0-7][0-9a-fA-F] | # \xhh -- hex code, up to 0x7F
# (higher values are not permitted
# because it is ambiguous whether
# they mean Unicode code points or
# byte values.)
#
\\u[0-9a-fA-F]{4} | # \uhhhh
\\U[0-9a-fA-F]{8} | # \Uhhhhhhhh
\\ (?: t | n | r) | # \t, \n, or \r
[^\\] | # anything except \
(?P<err_esc> # capture any invalid \escape
(?P<err_esc> # capture any invalid \escape sequence
\\x.{1,2} |
\\u.{1,4} |
\\U.{1,8} |
Expand All @@ -890,24 +899,60 @@ class BaseStringConstant(Nonterm):
$
''', re.X)

@classmethod
def parse_body(cls, str_tok: tokens.T_SCONST):
match = cls.valid_str_re.match(str_tok.val)
def reduce_SCONST(self, str_tok):
match = self.valid_str_re.match(str_tok.val)

if not match:
raise EdgeQLSyntaxError(
f"invalid str literal", context=str_tok.context)
f"invalid string literal", context=str_tok.context)
if match.group('err_esc'):
raise EdgeQLSyntaxError(
f"invalid str literal: invalid escape sequence "
f"invalid string literal: invalid escape sequence "
f"'{match.group('err_esc')}'",
context=str_tok.context)

return match.group('body')
quote = match.group('Q')
val = match.group('body')

# handle line continuations
val = re.sub(r'\\\n', '', val)

def reduce_SCONST(self, str_tok):
val = self.parse_body(str_tok)
self.val = qlast.Constant(value=val)
self.val = qlast.StringConstant(value=val, quote=quote)


class BaseRawStringConstant(Nonterm):
    # Nonterminal that converts an RSCONST token into a
    # qlast.RawStringConstant node.
    #
    # Documentation is kept in `#` comments rather than docstrings in case
    # the parser framework reads docstrings -- TODO confirm.

    # Verbose-mode pattern for a complete raw string token:
    #   * an optional `r` prefix;
    #   * group Q, the opening delimiter: either ' or " (only directly
    #     after an `r`, enforced by look-behind), or whatever
    #     `lexer.re_dquote` matches (only when NOT directly after an `r`);
    #     presumably re_dquote is the `$...$` delimiter pattern -- confirm
    #     in the lexer module;
    #   * group `body`: a lazy run of any characters, newlines included;
    #   * the closing delimiter, which must equal Q (back-reference).
    # The whole token must match (anchored with ^ and $).
    #
    # NOTE(review): this pattern is compiled with re.X only, while
    # EdgeQLLexer.RE_FLAGS includes re.I; if the lexer accepts an
    # upper-case `R` prefix it would be rejected here -- confirm.
    valid_rstr_re = re.compile(rf'''
^
(?:
r
)?
(?P<Q>
(?:
(?<=r) (?: ' | ")
) | (?:
(?<!r) (?: {lexer.re_dquote})
)
)
(?P<body>
(?:
\n | .
)*?
)
(?P=Q)
$
''', re.X)

    def reduce_RSCONST(self, str_tok):
        # Validate the token text against valid_rstr_re; a token that does
        # not match is reported as a syntax error at the token's context.
        match = self.valid_rstr_re.match(str_tok.val)
        if not match:
            raise EdgeQLSyntaxError(
                f"invalid raw string literal", context=str_tok.context)

        # The delimiter and the body are taken verbatim from the token;
        # no escape processing is applied to the body.
        quote = match.group('Q')
        val = match.group('body')

        self.val = qlast.RawStringConstant(value=val, quote=quote)


class BaseBytesConstant(Nonterm):
Expand All @@ -918,20 +963,18 @@ class BaseBytesConstant(Nonterm):
b
)
(?P<BQ>
(
' | "
)
' | "
)
(?P<body>
(
\n | # new line
\\\\ | # \\
\\['"] | # \' or \"
\\x[0-9a-fA-F]{2} | # \x00 -- hex code
\\ (t | n | r) | # \t, \n, or \r
\\x[0-9a-fA-F]{2} | # \xhh -- hex code
\\ (?: t | n | r) | # \t, \n, or \r
[\x20-\x5b\x5d-\x7e] | # match any printable ASCII, except '\'
(?P<err_esc> # capture any invalid \escape
(?P<err_esc> # capture any invalid \escape sequence
\\x.{1,2} |
\\.
) |
Expand Down
39 changes: 29 additions & 10 deletions edb/lang/edgeql/parser/grammar/lexer.py
Expand Up @@ -43,7 +43,7 @@ class EdgeQLLexer(lexer.Lexer):
MERGE_TOKENS = {('NAMED', 'ONLY')}

NL = 'NL'
MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST'))
MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST', 'RSCONST'))
RE_FLAGS = re.X | re.M | re.I

# Basic keywords
Expand Down Expand Up @@ -122,31 +122,50 @@ class EdgeQLLexer(lexer.Lexer):
b
)
(?P<BQ>
' | "
)
(?:
(
' | "
\\\\ | \\['"] | \n | .
# we'll validate escape codes in the parser
)*?
)
(?P=BQ)
'''),

Rule(token='RSCONST',
next_state=STATE_KEEP,
regexp=rf'''
(?:
r
)?
(?P<RQ>
(?:
(?<=r) (?: ' | ")
) | (?:
(?<!r) (?: {re_dquote})
)
)
(?:
(
(\\\\ | \\['"] | \n | .)*?
\n | .
# we'll validate escape codes in the parser
)*?
)
(?P=BQ)
(?P=RQ)
'''),

Rule(token='SCONST',
next_state=STATE_KEEP,
regexp=rf'''
(?P<Q>
(
' | " |
{re_dquote}
)
' | "
)
(?:
(\\\\ | \\['"] | \n | .)*?
# we'll validate escapes codes in the parser
(
\\\\ | \\['"] | \n | .
# we'll validate escape codes in the parser
)*?
)
(?P=Q)
'''),
Expand Down

0 comments on commit 399cb02

Please sign in to comment.