From 8a80a6651fda70972fa391cdb898a8e1de715829 Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Sat, 11 Feb 2023 15:06:16 +0000
Subject: [PATCH 1/3] Add guide for multi mode lexing

---
 hugo/content/guides/multi-mode-lexing.md | 116 +++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 hugo/content/guides/multi-mode-lexing.md

diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md
new file mode 100644
index 00000000..a0e71368
--- /dev/null
+++ b/hugo/content/guides/multi-mode-lexing.md
@@ -0,0 +1,116 @@
+---
+title: "Multi-Mode Lexing"
+weight: 400
+---
+
+Many modern programming languages such as [JavaScript](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals) or [C#](https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/tokens/interpolated) support template literals.
+They are a way to easily concatenate or interpolate string values while maintaining great code readability.
+This guide will show you how to support template literals in Langium.
+
+```antlr
+TemplateLiteral:
+    // Either just the full content
+    content+=TemplateContent | 
+    // Or template string parts with expressions in between
+    (
+        content+=TemplateContentStart 
+        content+=Expression? 
+        (
+            content+=TemplateContentMiddle 
+            content+=Expression?
+        )* 
+        content+=TemplateContentEnd
+    );
+
+TemplateContent returns TextLiteral:
+    value=RICH_TEXT;
+
+TemplateContentStart returns TextLiteral:
+    value=RICH_TEXT_START;
+
+TemplateContentMiddle returns TextLiteral:
+    value=RICH_TEXT_INBETWEEN;
+
+TemplateContentEnd returns TextLiteral:
+    value=RICH_TEXT_END;
+
+terminal RICH_TEXT:
+    '`' IN_RICH_TEXT* '`';
+
+terminal RICH_TEXT_START:
+    '`' IN_RICH_TEXT* '{';
+
+terminal RICH_TEXT_INBETWEEN:
+    '}' IN_RICH_TEXT* '{';
+
+terminal RICH_TEXT_END:
+    '}' IN_RICH_TEXT* '`';
+
+terminal fragment IN_RICH_TEXT:
+    /[^{`]|{{|``/;
+```
+
+```ts
+import { DefaultTokenBuilder, Grammar, isTokenTypeArray, Keyword, TerminalRule } from "langium";
+import { IMultiModeLexerDefinition, TokenType, TokenVocabulary } from "chevrotain";
+
+const REGULAR_MODE  = 'regular_mode';
+const TEMPLATE_MODE = 'template_mode';
+
+export class CustomTokenBuilder extends DefaultTokenBuilder {
+
+    override buildTokens(grammar: Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary {
+        const tokenTypes = super.buildTokens(grammar, options);
+
+        if(isTokenTypeArray(tokenTypes)) {
+            // Regular mode just drops rich text middle & end
+            const regularModeTokens = tokenTypes
+                .filter(token => !['RICH_TEXT_INBETWEEN','RICH_TEXT_END'].includes(token.name));
+            // Template mode needs to exclude the '}' keyword, which causes confusion while lexing
+            const templateModeTokens = tokenTypes
+                .filter(token => !['}'].includes(token.name));
+
+            const multiModeLexerDef: IMultiModeLexerDefinition = {
+                modes: {
+                    [REGULAR_MODE]: regularModeTokens,
+                    [TEMPLATE_MODE]: templateModeTokens
+                },
+                defaultMode: REGULAR_MODE
+            };
+            return multiModeLexerDef;
+        } else {
+            throw new Error('Invalid token vocabulary received from DefaultTokenBuilder!');
+        }
+    }
+
+    protected override buildKeywordToken(
+        keyword: Keyword,
+        terminalTokens: TokenType[],
+        caseInsensitive: boolean
+    ): TokenType {
+        let tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive);
+        
+        if (tokenType.name === '}') {
+            // The default } token will use [RICH_TEXT_INBETWEEN, RICH_TEXT_END] as longer alts
+            // We need to delete the LONGER_ALT, they are not valid for the regular lexer mode
+            delete tokenType.LONGER_ALT;
+        }
+
+        return tokenType;
+    }
+
+    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
+        let tokenType = super.buildTerminalToken(terminal);
+
+        // Update token types to enter & exit template mode
+        if(tokenType.name === 'RICH_TEXT_START') {
+            tokenType.PUSH_MODE = TEMPLATE_MODE;
+        } else if(tokenType.name === 'RICH_TEXT_END') {
+            tokenType.POP_MODE = true;
+        }
+
+        return tokenType;
+    }
+
+}
+```

From 746f1ed65b686bb57eeb719905037acb73f1fa0e Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Mon, 13 Feb 2023 00:15:52 +0000
Subject: [PATCH 2/3] Finish guide

---
 hugo/content/guides/multi-mode-lexing.md | 137 ++++++++++++++++-------
 1 file changed, 98 insertions(+), 39 deletions(-)

diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md
index a0e71368..203f2131 100644
--- a/hugo/content/guides/multi-mode-lexing.md
+++ b/hugo/content/guides/multi-mode-lexing.md
@@ -7,51 +7,81 @@ Many modern programming languages such as [JavaScript](https://developer.mozilla
 They are a way to easily concatenate or interpolate string values while maintaining great code readability.
 This guide will show you how to support template literals in Langium.
 
+For this specific example, our template literals start and end with backticks `` ` `` and are interrupted by expressions that are wrapped in curly braces `{}`.
+So in our example, usage of template literals might look something like this:
+
+```js
+println(`hello {name}!`);
+```
+
+Conceptually, template literals work by reading a start terminal which starts with `` ` `` and ends with `{`, 
+followed by an expression and then an end terminal which is effectively just the start terminal in reverse using `}` and `` ` ``.
+Since we don't want to restrict users to only a single expression in their template literals, we also need a "middle" terminal reading from `}` to `{`.
+Of course, there's also the option that a user only uses a template literal without any expressions in there.
+So we additionally need a "full" terminal that reads from the start of the literal all the way to the end in one go.
+
+To achieve this, we will define a `TemplateLiteral` parser rule and a few terminals.
+These terminals will adhere to the requirements that we just defined.
+To make it a bit easier to read and maintain, we also define a special terminal fragment that we can reuse in all our terminal definitions:
+
 ```antlr
 TemplateLiteral:
     // Either just the full content
-    content+=TemplateContent | 
-    // Or template string parts with expressions in between
+    content+=TEMPLATE_LITERAL_FULL |
+    // Or template literal parts with expressions in between
     (
-        content+=TemplateContentStart 
-        content+=Expression? 
+        content+=TEMPLATE_LITERAL_START 
+        content+=Expression?
         (
-            content+=TemplateContentMiddle 
+            content+=TEMPLATE_LITERAL_MIDDLE
             content+=Expression?
-        )* 
-        content+=TemplateContentEnd
-    );
+        )*
+        content+=TEMPLATE_LITERAL_END
+    )
+;
 
-TemplateContent returns TextLiteral:
-    value=RICH_TEXT;
+terminal TEMPLATE_LITERAL_FULL:
+    '`' IN_TEMPLATE_LITERAL* '`';
 
-TemplateContentStart returns TextLiteral:
-    value=RICH_TEXT_START;
+terminal TEMPLATE_LITERAL_START:
+    '`' IN_TEMPLATE_LITERAL* '{';
 
-TemplateContentMiddle returns TextLiteral:
-    value=RICH_TEXT_INBETWEEN;
+terminal TEMPLATE_LITERAL_MIDDLE:
+    '}' IN_TEMPLATE_LITERAL* '{';
 
-TemplateContentEnd returns TextLiteral:
-    value=RICH_TEXT_END;
+terminal TEMPLATE_LITERAL_END:
+    '}' IN_TEMPLATE_LITERAL* '`';
 
-terminal RICH_TEXT:
-    '`' IN_RICH_TEXT* '`';
+// '{{' is handled in a special way so we can escape normal '{' characters
+// '``' is doing the same for the '`' character
+terminal fragment IN_TEMPLATE_LITERAL:
+    /[^{`]|{{|``/;
+```
 
-terminal RICH_TEXT_START:
-    '`' IN_RICH_TEXT* '{';
+If we go ahead and start parsing files with these changes, most things should work as expected.
+However, depending on the structure of your existing grammar, some of these new terminals might be in conflict with existing terminals of your language.
+For example, if your language supports block statements, chaining multiple blocks together will make this issue apparent:
 
-terminal RICH_TEXT_INBETWEEN:
-    '}' IN_RICH_TEXT* '{';
+```js
+{
+    console.log('hi');
+}
+{
+    console.log('hello');
+}
+```
 
-terminal RICH_TEXT_END:
-    '}' IN_RICH_TEXT* '`';
+The `} ... {` block in this example won't be parsed as separate `}` and `{` tokens, but instead as a single `TEMPLATE_LITERAL_MIDDLE` token, resulting in a parser error due to the unexpected token.
+This doesn't make a lot of sense, since we aren't in the middle of a template literal at this point anyway.
+However, our lexer doesn't know yet that the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals are only allowed to show up within a `TemplateLiteral` rule.
+To rectify this, we will need to make use of lexer modes. They will give us the necessary context to know whether we're inside a template literal or outside of it.
+Depending on the currently selected mode, we can lex different terminals. In our case, we want to exclude the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals from the regular mode, so they can only be matched while we're inside a template literal.
 
-terminal fragment IN_RICH_TEXT:
-    /[^{`]|{{|``/;
-```
+The following implementation of a `TokenBuilder` will do the job for us. It creates two lexing modes, which are almost identical except for the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals.
+We will also need to make sure that the modes are switched based on the `TEMPLATE_LITERAL_START` and `TEMPLATE_LITERAL_END` terminals. We use `PUSH_MODE` and `POP_MODE` for this.
 
 ```ts
-import { DefaultTokenBuilder, Grammar, isTokenTypeArray, Keyword, TerminalRule } from "langium";
+import { DefaultTokenBuilder, isTokenTypeArray, GrammarAST } from "langium";
 import { IMultiModeLexerDefinition, TokenType, TokenVocabulary } from "chevrotain";
 
 const REGULAR_MODE  = 'regular_mode';
@@ -59,14 +89,14 @@ const TEMPLATE_MODE = 'template_mode';
 
 export class CustomTokenBuilder extends DefaultTokenBuilder {
 
-    override buildTokens(grammar: Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary {
+    override buildTokens(grammar: GrammarAST.Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary {
         const tokenTypes = super.buildTokens(grammar, options);
 
         if(isTokenTypeArray(tokenTypes)) {
-            // Regular mode just drops rich text middle & end
+            // Regular mode just drops template literal middle & end
             const regularModeTokens = tokenTypes
-                .filter(token => !['RICH_TEXT_INBETWEEN','RICH_TEXT_END'].includes(token.name));
-            // Template mode needs to exclude the '}' keyword, which causes confusion while lexing
+                .filter(token => !['TEMPLATE_LITERAL_MIDDLE','TEMPLATE_LITERAL_END'].includes(token.name));
+            // Template mode needs to exclude the '}' keyword
             const templateModeTokens = tokenTypes
                 .filter(token => !['}'].includes(token.name));
 
@@ -84,33 +114,62 @@ export class CustomTokenBuilder extends DefaultTokenBuilder {
     }
 
     protected override buildKeywordToken(
-        keyword: Keyword,
+        keyword: GrammarAST.Keyword,
         terminalTokens: TokenType[],
         caseInsensitive: boolean
     ): TokenType {
         let tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive);
         
         if (tokenType.name === '}') {
-            // The default } token will use [RICH_TEXT_INBETWEEN, RICH_TEXT_END] as longer alts
+            // The default } token will use [TEMPLATE_LITERAL_MIDDLE, TEMPLATE_LITERAL_END] as longer alts
             // We need to delete the LONGER_ALT, they are not valid for the regular lexer mode
             delete tokenType.LONGER_ALT;
         }
-
         return tokenType;
     }
 
-    protected override buildTerminalToken(terminal: TerminalRule): TokenType {
+    protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType {
         let tokenType = super.buildTerminalToken(terminal);
 
         // Update token types to enter & exit template mode
-        if(tokenType.name === 'RICH_TEXT_START') {
+        if(tokenType.name === 'TEMPLATE_LITERAL_START') {
             tokenType.PUSH_MODE = TEMPLATE_MODE;
-        } else if(tokenType.name === 'RICH_TEXT_END') {
+        } else if(tokenType.name === 'TEMPLATE_LITERAL_END') {
             tokenType.POP_MODE = true;
         }
-
         return tokenType;
     }
+}
+```
+
+With this change in place, the parser will work as expected. There is one last issue which we need to resolve in order to get everything working perfectly.
+When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there (mainly `` ` ``, `{` and `}`).
+These aren't actually part of the semantic value of these strings, so we should get rid of them.
+We will need to create a custom `ValueConverter` and remove these artifacts:
+
+```ts
+import { CstNode, GrammarAST, DefaultValueConverter, ValueType, convertString } from 'langium';
+
+export class CustomValueConverter extends DefaultValueConverter {
 
+    protected override runConverter(rule: GrammarAST.AbstractRule, input: string, cstNode: CstNode): ValueType {
+        if (rule.name.startsWith('TEMPLATE_LITERAL')) {
+            // 'convertString' simply removes the first and last character of the input
+            return convertString(input);
+        } else {
+            return super.runConverter(rule, input, cstNode);
+        }
+    }
 }
 ```
+
+Of course, let's not forget to bind all of these services:
+
+```ts
+export const CustomModule = {
+    parser: {
+        TokenBuilder: () => new CustomTokenBuilder(),
+        ValueConverter: () => new CustomValueConverter()
+    },
+};
+```
\ No newline at end of file

From ef14db1549b1ae177325cae024ab51a2225ab53b Mon Sep 17 00:00:00 2001
From: Mark Sujew <mark.sujew@typefox.io>
Date: Mon, 13 Feb 2023 00:18:42 +0000
Subject: [PATCH 3/3] Improve syntax

---
 hugo/content/guides/multi-mode-lexing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md
index 203f2131..8f605c8c 100644
--- a/hugo/content/guides/multi-mode-lexing.md
+++ b/hugo/content/guides/multi-mode-lexing.md
@@ -143,7 +143,7 @@ export class CustomTokenBuilder extends DefaultTokenBuilder {
 ```
 
 With this change in place, the parser will work as expected. There is one last issue which we need to resolve in order to get everything working perfectly.
-When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there (mainly `` ` ``, `{` and `}`).
+When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there; mainly `` ` ``, `{` and `}`.
 These aren't actually part of the semantic value of these strings, so we should get rid of them.
 We will need to create a custom `ValueConverter` and remove these artifacts: