Add support for raw string literals; tweak lexer/ast.

1. Add raw strings.

   * `$$ ... $$` strings are now raw strings.
   * New syntax: `r'...'` and `r"..."` -- raw string literals.

   Raw strings do not process any escapes, meaning that the `\` character is treated like any other character.

2. Tweak string literals.

   Regular strings no longer accept `\xhh` codes over 0x7F. This is modeled after Rust strings [1], quoting: "Higher values are not permitted because it is ambiguous whether they mean Unicode code points or byte values."

3. New AST nodes.

   The EdgeQL and IR implementations got two new nodes: `StringConstant` and `RawStringConstant`. SQL also has a new node: `EscapedStringConstant` (compiles to an `E'...'` string).

[1] https://doc.rust-lang.org/reference/tokens.html#character-escapes
edgedb · Oct 25, 2018 · 399cb02 · 399cb02
1 parent e793fd3
commit 399cb02
Show file tree

Hide file tree

Showing 21 changed files with 310 additions and 69 deletions.
diff --git a/edb/lang/edgeql/ast.py b/edb/lang/edgeql/ast.py
@@ -24,6 +24,7 @@
 from edb.lang.common import ast, parsing
 
 from . import functypes as ft
+from . import quote
 
 
 # Operators
@@ -235,6 +236,25 @@ class Constant(Expr):
     value: typing.Union[int, str, float, bool, bytes, decimal.Decimal]
 
 
+class StringConstant(Constant):
+    quote: str
+
+    @classmethod
+    def from_pystr(cls, s: str):
+        s = s.replace('\\', '\\\\')
+        value = quote.quote_literal(s)
+        return cls(value=value[1:-1], quote="'")
+
+
+class RawStringConstant(Constant):
+    quote: str
+
+    @classmethod
+    def from_pystr(cls, s: str):
+        value = quote.quote_literal(s)
+        return cls(value=value[1:-1], quote="'")
+
+
 class Parameter(Expr):
     name: str
 

diff --git a/edb/lang/edgeql/codegen.py b/edb/lang/edgeql/codegen.py
@@ -505,6 +505,15 @@ def visit_ShapeElement(self, node):
     def visit_Parameter(self, node):
         self.write(param_to_str(node.name))
 
+    def visit_StringConstant(self, node):
+        self.write(node.quote, node.value, node.quote)
+
+    def visit_RawStringConstant(self, node):
+        if node.quote.startswith('$'):
+            self.write(node.quote, node.value, node.quote)
+        else:
+            self.write('r', node.quote, node.value, node.quote)
+
     def visit_Constant(self, node):
         if isinstance(node.value, str):
             self.write(edgeql_quote.quote_literal(node.value))

diff --git a/edb/lang/edgeql/compiler/decompiler.py b/edb/lang/edgeql/compiler/decompiler.py
@@ -175,6 +175,12 @@ def visit_Parameter(self, node):
     def visit_Constant(self, node):
         return qlast.Constant(value=node.value)
 
+    def visit_StringConstant(self, node):
+        return qlast.StringConstant.from_pystr(node.value)
+
+    def visit_RawStringConstant(self, node):
+        return qlast.RawStringConstant.from_pystr(node.value)
+
     def visit_Array(self, node):
         return qlast.Array(elements=[
             self.visit(e) for e in node.elements

diff --git a/edb/lang/edgeql/compiler/expr.py b/edb/lang/edgeql/compiler/expr.py
@@ -161,11 +161,20 @@ def compile_Set(
 def compile_Constant(
         expr: qlast.Base, *, ctx: context.ContextLevel) -> irast.Base:
 
+    node_cls = irast.Constant
+
     if expr.value is None:
         ct = None
     else:
-        if isinstance(expr.value, str):
+        if isinstance(expr, qlast.StringConstant):
+            std_type = 'std::str'
+            node_cls = irast.StringConstant
+        elif isinstance(expr, qlast.RawStringConstant):
+            std_type = 'std::str'
+            node_cls = irast.RawStringConstant
+        elif isinstance(expr.value, str):
             std_type = 'std::str'
+            node_cls = irast.StringConstant
         elif isinstance(expr.value, decimal.Decimal):
             std_type = 'std::decimal'
         elif isinstance(expr.value, float):
@@ -187,7 +196,7 @@ def compile_Constant(
         ct = ctx.schema.get(std_type)
 
     return setgen.generated_set(
-        irast.Constant(value=expr.value, type=ct), ctx=ctx)
+        node_cls(value=expr.value, type=ct), ctx=ctx)
 
 
 @dispatch.compile.register(qlast.EmptyCollection)

diff --git a/edb/lang/edgeql/parser/grammar/ddl.py b/edb/lang/edgeql/parser/grammar/ddl.py
@@ -29,7 +29,7 @@
 
 from ...errors import EdgeQLSyntaxError
 
-from .expressions import Nonterm, BaseStringConstant
+from .expressions import Nonterm
 from . import tokens
 
 from .precedence import *  # NOQA
@@ -400,9 +400,11 @@ class OptDeltaTarget(Nonterm):
     def reduce_empty(self):
         self.val = None
 
-    def reduce_TO_AnyIdentifier_SCONST(self, *kids):
+    def reduce_TO_AnyIdentifier_BaseStringConstant(self, *kids):
         self.val = [kids[1], kids[2]]
 
+    def reduce_TO_AnyIdentifier_BaseRawStringConstant(self, *kids):
+        self.val = [kids[1], kids[2]]
 
 #
 # DELTAS
@@ -411,15 +413,17 @@ def reduce_TO_AnyIdentifier_SCONST(self, *kids):
 #
 # CREATE MIGRATION
 #
+
+
 class CreateDeltaStmt(Nonterm):
-    def _parse_schema_decl(self, tok: tokens.T_SCONST):
+    def _parse_schema_decl(self, tok):
         from edb.lang.common.exceptions import get_context
         from edb.lang.schema import parser
 
         ctx = tok.context
 
         try:
-            node = parser.parse(BaseStringConstant.parse_body(tok))
+            node = parser.parse(tok.val.value)
         except parsing.ParserError as err:
             context.rebase_context(
                 ctx, get_context(err, parsing.ParserContext))
@@ -1638,14 +1642,26 @@ def _parse_language(node):
 
 
 class FromFunction(Nonterm):
-    def reduce_FROM_Identifier_SCONST(self, *kids):
+    def reduce_FROM_Identifier_BaseStringConstant(self, *kids):
         lang = _parse_language(kids[1])
+        code = kids[2].val.value
+        self.val = qlast.FunctionCode(language=lang, code=code)
 
-        # we need literal value of the string
-        code = BaseStringConstant.parse_body(kids[2])
-
+    def reduce_FROM_Identifier_BaseRawStringConstant(self, *kids):
+        lang = _parse_language(kids[1])
+        code = kids[2].val.value
         self.val = qlast.FunctionCode(language=lang, code=code)
 
+    def reduce_FROM_Identifier_FUNCTION_BaseRawStringConstant(self, *kids):
+        lang = _parse_language(kids[1])
+        if lang != qlast.Language.SQL:
+            raise EdgeQLSyntaxError(
+                f'{lang} language is not supported in FROM FUNCTION clause',
+                context=kids[1].context) from None
+
+        self.val = qlast.FunctionCode(language=lang,
+                                      from_name=kids[3].val.value)
+
     def reduce_FROM_Identifier_FUNCTION_BaseStringConstant(self, *kids):
         lang = _parse_language(kids[1])
         if lang != qlast.Language.SQL:

diff --git a/edb/lang/edgeql/parser/grammar/expressions.py b/edb/lang/edgeql/parser/grammar/expressions.py
@@ -30,7 +30,7 @@
 
 from ...errors import EdgeQLSyntaxError
 
-from . import keywords, precedence, tokens
+from . import keywords, precedence, tokens, lexer
 
 from .precedence import *  # NOQA
 from .tokens import *  # NOQA
@@ -823,6 +823,7 @@ class Constant(Nonterm):
     # ArgConstant
     # | BaseNumberConstant
     # | BaseStringConstant
+    # | BaseRawStringConstant
     # | BaseBooleanConstant
     # | BaseBytesConstant
 
@@ -832,6 +833,9 @@ def reduce_ArgConstant(self, *kids):
     def reduce_BaseNumberConstant(self, *kids):
         self.val = kids[0].val
 
+    def reduce_BaseRawStringConstant(self, *kids):
+        self.val = kids[0].val
+
     def reduce_BaseStringConstant(self, *kids):
         self.val = kids[0].val
 
@@ -863,22 +867,27 @@ class BaseStringConstant(Nonterm):
     valid_str_re = re.compile(r'''
         ^
         (?P<Q>
-            (
-                ' | " | \$([A-Za-z\200-\377_][0-9]*)*\$
-            )
+            ' | "
         )
         (?P<body>
             (?:
+                \\\n |                  # line continuation
                 \n |                    # new line
                 \\\\ |                  # \\
                 \\['"] |                # \' or \"
-                \\x[0-9a-fA-F]{2} |     # \x00 -- hex code
-                \\u[0-9a-fA-F]{4} |     # \u0000
-                \\U[0-9a-fA-F]{8} |     # \U00000000
-                \\ (t | n | r) |        # \t, \n, or \r
+                                        #
+                \\x[0-7][0-9a-fA-F] |   # \xhh -- hex code, up to 0x7F
+                                        # (higher values are not permitted
+                                        # because it is ambiguous whether
+                                        # they mean Unicode code points or
+                                        # byte values.)
+                                        #
+                \\u[0-9a-fA-F]{4} |     # \uhhhh
+                \\U[0-9a-fA-F]{8} |     # \Uhhhhhhhh
+                \\ (?: t | n | r) |     # \t, \n, or \r
                 [^\\] |                 # anything except \
 
-                (?P<err_esc>            # capture any invalid \escape
+                (?P<err_esc>            # capture any invalid \escape sequence
                     \\x.{1,2} |
                     \\u.{1,4} |
                     \\U.{1,8} |
@@ -890,24 +899,60 @@ class BaseStringConstant(Nonterm):
         $
     ''', re.X)
 
-    @classmethod
-    def parse_body(cls, str_tok: tokens.T_SCONST):
-        match = cls.valid_str_re.match(str_tok.val)
+    def reduce_SCONST(self, str_tok):
+        match = self.valid_str_re.match(str_tok.val)
 
         if not match:
             raise EdgeQLSyntaxError(
-                f"invalid str literal", context=str_tok.context)
+                f"invalid string literal", context=str_tok.context)
         if match.group('err_esc'):
             raise EdgeQLSyntaxError(
-                f"invalid str literal: invalid escape sequence "
+                f"invalid string literal: invalid escape sequence "
                 f"'{match.group('err_esc')}'",
                 context=str_tok.context)
 
-        return match.group('body')
+        quote = match.group('Q')
+        val = match.group('body')
+
+        # handle line continuations
+        val = re.sub(r'\\\n', '', val)
 
-    def reduce_SCONST(self, str_tok):
-        val = self.parse_body(str_tok)
-        self.val = qlast.Constant(value=val)
+        self.val = qlast.StringConstant(value=val, quote=quote)
+
+
+class BaseRawStringConstant(Nonterm):
+
+    valid_rstr_re = re.compile(rf'''
+        ^
+        (?:
+            r
+        )?
+        (?P<Q>
+            (?:
+                (?<=r) (?: ' | ")
+            ) | (?:
+                (?<!r) (?: {lexer.re_dquote})
+            )
+        )
+        (?P<body>
+            (?:
+                \n | .
+            )*?
+        )
+        (?P=Q)
+        $
+    ''', re.X)
+
+    def reduce_RSCONST(self, str_tok):
+        match = self.valid_rstr_re.match(str_tok.val)
+        if not match:
+            raise EdgeQLSyntaxError(
+                f"invalid raw string literal", context=str_tok.context)
+
+        quote = match.group('Q')
+        val = match.group('body')
+
+        self.val = qlast.RawStringConstant(value=val, quote=quote)
 
 
 class BaseBytesConstant(Nonterm):
@@ -918,20 +963,18 @@ class BaseBytesConstant(Nonterm):
             b
         )
         (?P<BQ>
-            (
-                ' | "
-            )
+            ' | "
         )
         (?P<body>
             (
                 \n |                    # new line
                 \\\\ |                  # \\
                 \\['"] |                # \' or \"
-                \\x[0-9a-fA-F]{2} |     # \x00 -- hex code
-                \\ (t | n | r) |        # \t, \n, or \r
+                \\x[0-9a-fA-F]{2} |     # \xhh -- hex code
+                \\ (?: t | n | r) |     # \t, \n, or \r
                 [\x20-\x5b\x5d-\x7e] |  # match any printable ASCII, except '\'
 
-                (?P<err_esc>            # capture any invalid \escape
+                (?P<err_esc>            # capture any invalid \escape sequence
                     \\x.{1,2} |
                     \\.
                 ) |

diff --git a/edb/lang/edgeql/parser/grammar/lexer.py b/edb/lang/edgeql/parser/grammar/lexer.py
@@ -43,7 +43,7 @@ class EdgeQLLexer(lexer.Lexer):
     MERGE_TOKENS = {('NAMED', 'ONLY')}
 
     NL = 'NL'
-    MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST'))
+    MULTILINE_TOKENS = frozenset(('SCONST', 'BCONST', 'RSCONST'))
     RE_FLAGS = re.X | re.M | re.I
 
     # Basic keywords
@@ -122,31 +122,50 @@ class EdgeQLLexer(lexer.Lexer):
                     b
                 )
                 (?P<BQ>
+                    ' | "
+                )
+                (?:
                     (
-                        ' | "
+                        \\\\ | \\['"] | \n | .
+                        # we'll validate escape codes in the parser
+                    )*?
+                )
+                (?P=BQ)
+             '''),
+
+        Rule(token='RSCONST',
+             next_state=STATE_KEEP,
+             regexp=rf'''
+                (?:
+                    r
+                )?
+                (?P<RQ>
+                    (?:
+                        (?<=r) (?: ' | ")
+                    ) | (?:
+                        (?<!r) (?: {re_dquote})
                     )
                 )
                 (?:
                     (
-                        (\\\\ | \\['"] | \n | .)*?
+                        \n | .
                         # we'll validate escape codes in the parser
                     )*?
                 )
-                (?P=BQ)
+                (?P=RQ)
              '''),
 
         Rule(token='SCONST',
              next_state=STATE_KEEP,
              regexp=rf'''
                 (?P<Q>
-                    (
-                        ' | " |
-                        {re_dquote}
-                    )
+                    ' | "
                 )
                 (?:
-                    (\\\\ | \\['"] | \n | .)*?
-                    # we'll validate escapes codes in the parser
+                    (
+                        \\\\ | \\['"] | \n | .
+                        # we'll validate escape codes in the parser
+                    )*?
                 )
                 (?P=Q)
              '''),