From 8a80a6651fda70972fa391cdb898a8e1de715829 Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Sat, 11 Feb 2023 15:06:16 +0000 Subject: [PATCH 1/3] Add guide for multi mode lexing --- hugo/content/guides/multi-mode-lexing.md | 116 +++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 hugo/content/guides/multi-mode-lexing.md diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md new file mode 100644 index 00000000..a0e71368 --- /dev/null +++ b/hugo/content/guides/multi-mode-lexing.md @@ -0,0 +1,116 @@ +--- +title: "Multi-Mode Lexing" +weight: 400 +--- + +Many modern programming languages such as [JavaScript](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Template_literals) or [C#](https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/tokens/interpolated) support template literals. +They are a way to easily concatenate or interpolate string values while maintaining great code readability. +This guide will show you how to support template literals in Langium. + +```antlr +TemplateLiteral: + // Either just the full content + content+=TemplateContent | + // Or template string parts with expressions in between + ( + content+=TemplateContentStart + content+=Expression? + ( + content+=TemplateContentMiddle + content+=Expression? 
+ )* + content+=TemplateContentEnd + ); + +TemplateContent returns TextLiteral: + value=RICH_TEXT; + +TemplateContentStart returns TextLiteral: + value=RICH_TEXT_START; + +TemplateContentMiddle returns TextLiteral: + value=RICH_TEXT_INBETWEEN; + +TemplateContentEnd returns TextLiteral: + value=RICH_TEXT_END; + +terminal RICH_TEXT: + '`' IN_RICH_TEXT* '`'; + +terminal RICH_TEXT_START: + '`' IN_RICH_TEXT* '{'; + +terminal RICH_TEXT_INBETWEEN: + '}' IN_RICH_TEXT* '{'; + +terminal RICH_TEXT_END: + '}' IN_RICH_TEXT* '`'; + +terminal fragment IN_RICH_TEXT: + /[^{`]|{{|``/; +``` + +```ts +import { DefaultTokenBuilder, Grammar, isTokenTypeArray, Keyword, TerminalRule } from "langium"; +import { IMultiModeLexerDefinition, TokenType, TokenVocabulary } from "chevrotain"; + +const REGULAR_MODE = 'regular_mode'; +const TEMPLATE_MODE = 'template_mode'; + +export class CustomTokenBuilder extends DefaultTokenBuilder { + + override buildTokens(grammar: Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary { + const tokenTypes = super.buildTokens(grammar, options); + + if(isTokenTypeArray(tokenTypes)) { + // Regular mode just drops rich text middle & end + const regularModeTokens = tokenTypes + .filter(token => !['RICH_TEXT_INBETWEEN','RICH_TEXT_END'].includes(token.name)); + // Template mode needs to exclude the '}' keyword, which causes confusion while lexing + const templateModeTokens = tokenTypes + .filter(token => !['}'].includes(token.name)); + + const multiModeLexerDef: IMultiModeLexerDefinition = { + modes: { + [REGULAR_MODE]: regularModeTokens, + [TEMPLATE_MODE]: templateModeTokens + }, + defaultMode: REGULAR_MODE + }; + return multiModeLexerDef; + } else { + throw new Error('Invalid token vocabulary received from DefaultTokenBuilder!'); + } + } + + protected override buildKeywordToken( + keyword: Keyword, + terminalTokens: TokenType[], + caseInsensitive: boolean + ): TokenType { + let tokenType = super.buildKeywordToken(keyword, terminalTokens, 
caseInsensitive); + + if (tokenType.name === '}') { + // The default } token will use [RICH_TEXT_INBETWEEN, RICH_TEXT_END] as longer alts + // We need to delete the LONGER_ALT, they are not valid for the regular lexer mode + delete tokenType.LONGER_ALT; + } + + return tokenType; + } + + protected override buildTerminalToken(terminal: TerminalRule): TokenType { + let tokenType = super.buildTerminalToken(terminal); + + // Update token types to enter & exit template mode + if(tokenType.name === 'RICH_TEXT_START') { + tokenType.PUSH_MODE = TEMPLATE_MODE; + } else if(tokenType.name === 'RICH_TEXT_END') { + tokenType.POP_MODE = true; + } + + return tokenType; + } + +} +``` From 746f1ed65b686bb57eeb719905037acb73f1fa0e Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Mon, 13 Feb 2023 00:15:52 +0000 Subject: [PATCH 2/3] Finish guide --- hugo/content/guides/multi-mode-lexing.md | 137 ++++++++++++++++------- 1 file changed, 98 insertions(+), 39 deletions(-) diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md index a0e71368..203f2131 100644 --- a/hugo/content/guides/multi-mode-lexing.md +++ b/hugo/content/guides/multi-mode-lexing.md @@ -7,51 +7,81 @@ Many modern programming languages such as [JavaScript](https://developer.mozilla They are a way to easily concatenate or interpolate string values while maintaining great code readability. This guide will show you how to support template literals in Langium. +For this specific example, our template literals start and end with backticks `` ` `` and are interrupted by expressions that are wrapped in curly braces `{}`. +So in our example, usage of template literals might look something like this: + +```js +println(`hello {name}!`); +``` + +Conceptually, template literals work by reading a start terminal which starts with `` ` `` and ends with `{`, +followed by an expression and then an end terminal which is effectively just the start terminal in reverse using `}` and `` ` ``. 
+Since we don't want to restrict users to only a single expression in their template literals, we also need a "middle" terminal reading from `}` to `{`. +Of course, there's also the option that a user only uses a template literal without any expressions in there. +So we additionally need a "full" terminal that reads from the start of the literal all the way to the end in one go. + +To achieve this, we will define a `TemplateLiteral` parser rule and a few terminals. +These terminals will adhere to the requirements that we just defined. +To make it a bit easier to read and maintain, we also define a special terminal fragment that we can reuse in all our terminal definitions: + ```antlr TemplateLiteral: // Either just the full content - content+=TemplateContent | - // Or template string parts with expressions in between + content+=TEMPLATE_LITERAL_FULL | + // Or template literal parts with expressions in between ( - content+=TemplateContentStart - content+=Expression? + content+=TEMPLATE_LITERAL_START + content+=Expression? ( - content+=TemplateContentMiddle + content+=TEMPLATE_LITERAL_MIDDLE content+=Expression? 
- )* - content+=TemplateContentEnd - ); + )* + content+=TEMPLATE_LITERAL_END + ) +; -TemplateContent returns TextLiteral: - value=RICH_TEXT; +terminal TEMPLATE_LITERAL_FULL: + '`' IN_TEMPLATE_LITERAL* '`'; -TemplateContentStart returns TextLiteral: - value=RICH_TEXT_START; +terminal TEMPLATE_LITERAL_START: + '`' IN_TEMPLATE_LITERAL* '{'; -TemplateContentMiddle returns TextLiteral: - value=RICH_TEXT_INBETWEEN; +terminal TEMPLATE_LITERAL_MIDDLE: + '}' IN_TEMPLATE_LITERAL* '{'; -TemplateContentEnd returns TextLiteral: - value=RICH_TEXT_END; +terminal TEMPLATE_LITERAL_END: + '}' IN_TEMPLATE_LITERAL* '`'; -terminal RICH_TEXT: - '`' IN_RICH_TEXT* '`'; +// '{{' is handled in a special way so we can escape normal '{' characters +// '``' is doing the same for the '`' character +terminal fragment IN_TEMPLATE_LITERAL: + /[^{`]|{{|``/; +``` -terminal RICH_TEXT_START: - '`' IN_RICH_TEXT* '{'; +If we go ahead and start parsing files with these changes, most things should work as expected. +However, depending on the structure of your existing grammar, some of these new terminals might be in conflict with existing terminals of your language. +For example, if your language supports block statements, chaining multiple blocks together will make this issue apparent: -terminal RICH_TEXT_INBETWEEN: - '}' IN_RICH_TEXT* '{'; +```js +{ + console.log('hi'); +} +{ + console.log('hello'); +} +``` -terminal RICH_TEXT_END: - '}' IN_RICH_TEXT* '`'; +The `} ... {` block in this example won't be parsed as separate `}` and `{` tokens, but instead as a single `TEMPLATE_LITERAL_MIDDLE` token, resulting in a parser error due to the unexpected token. +This doesn't make a lot of sense, since we aren't in the middle of a template literal at this point anyway. +However, our lexer doesn't know yet that the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals are only allowed to show up within a `TemplateLiteral` rule. +To rectify this, we will need to make use of lexer modes. 
They will give us the necessary context to know whether we're inside a template literal or outside of it. +Depending on the currently selected mode, we can lex different terminals. In our case, we want to exclude the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals whenever we're outside of a template literal. -terminal fragment IN_RICH_TEXT: - /[^{`]|{{|``/; -``` +The following implementation of a `TokenBuilder` will do the job for us. It creates two lexing modes, which are almost identical except for the `TEMPLATE_LITERAL_MIDDLE` and `TEMPLATE_LITERAL_END` terminals. +We will also need to make sure that the modes are switched based on the `TEMPLATE_LITERAL_START` and `TEMPLATE_LITERAL_END` terminals. We use `PUSH_MODE` and `POP_MODE` for this. ```ts -import { DefaultTokenBuilder, Grammar, isTokenTypeArray, Keyword, TerminalRule } from "langium"; +import { DefaultTokenBuilder, isTokenTypeArray, GrammarAST } from "langium"; import { IMultiModeLexerDefinition, TokenType, TokenVocabulary } from "chevrotain"; const REGULAR_MODE = 'regular_mode'; @@ -59,14 +89,14 @@ const TEMPLATE_MODE = 'template_mode'; export class CustomTokenBuilder extends DefaultTokenBuilder { - override buildTokens(grammar: Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary { + override buildTokens(grammar: GrammarAST.Grammar, options?: { caseInsensitive?: boolean }): TokenVocabulary { const tokenTypes = super.buildTokens(grammar, options); if(isTokenTypeArray(tokenTypes)) { - // Regular mode just drops rich text middle & end + // Regular mode just drops template literal middle & end const regularModeTokens = tokenTypes - .filter(token => !['RICH_TEXT_INBETWEEN','RICH_TEXT_END'].includes(token.name)); - // Template mode needs to exclude the '}' keyword, which causes confusion while lexing + .filter(token => !['TEMPLATE_LITERAL_MIDDLE','TEMPLATE_LITERAL_END'].includes(token.name)); + // Template mode needs to exclude the '}' keyword const templateModeTokens = tokenTypes .filter(token => 
!['}'].includes(token.name)); @@ -84,33 +114,62 @@ export class CustomTokenBuilder extends DefaultTokenBuilder { } protected override buildKeywordToken( - keyword: Keyword, + keyword: GrammarAST.Keyword, terminalTokens: TokenType[], caseInsensitive: boolean ): TokenType { let tokenType = super.buildKeywordToken(keyword, terminalTokens, caseInsensitive); if (tokenType.name === '}') { - // The default } token will use [RICH_TEXT_INBETWEEN, RICH_TEXT_END] as longer alts + // The default } token will use [TEMPLATE_LITERAL_MIDDLE, TEMPLATE_LITERAL_END] as longer alts // We need to delete the LONGER_ALT, they are not valid for the regular lexer mode delete tokenType.LONGER_ALT; } - return tokenType; } - protected override buildTerminalToken(terminal: TerminalRule): TokenType { + protected override buildTerminalToken(terminal: GrammarAST.TerminalRule): TokenType { let tokenType = super.buildTerminalToken(terminal); // Update token types to enter & exit template mode - if(tokenType.name === 'RICH_TEXT_START') { + if(tokenType.name === 'TEMPLATE_LITERAL_START') { tokenType.PUSH_MODE = TEMPLATE_MODE; - } else if(tokenType.name === 'RICH_TEXT_END') { + } else if(tokenType.name === 'TEMPLATE_LITERAL_END') { tokenType.POP_MODE = true; } - return tokenType; } +} +``` + +With this change in place, the parser will work as expected. There is one last issue which we need to resolve in order to get everything working perfectly. +When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there (mainly `` ` ``, `{` and `}`). +These aren't actually part of the semantic value of these strings, so we should get rid of them. 
+We will need to create a custom `ValueConverter` and remove these artifacts: + +```ts +import { CstNode, GrammarAST, DefaultValueConverter, ValueType, convertString } from 'langium'; + +export class CustomValueConverter extends DefaultValueConverter { + protected override runConverter(rule: GrammarAST.AbstractRule, input: string, cstNode: CstNode): ValueType { + if (rule.name.startsWith('TEMPLATE_LITERAL')) { + // 'convertString' simply removes the first and last character of the input + return convertString(input); + } else { + return super.runConverter(rule, input, cstNode); + } + } } ``` + +Of course, let's not forget to bind all of these services: + +```ts +export const CustomModule = { + parser: { + TokenBuilder: () => new CustomTokenBuilder(), + ValueConverter: () => new CustomValueConverter() + }, +}; +``` \ No newline at end of file From ef14db1549b1ae177325cae024ab51a2225ab53b Mon Sep 17 00:00:00 2001 From: Mark Sujew Date: Mon, 13 Feb 2023 00:18:42 +0000 Subject: [PATCH 3/3] Improve syntax --- hugo/content/guides/multi-mode-lexing.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hugo/content/guides/multi-mode-lexing.md b/hugo/content/guides/multi-mode-lexing.md index 203f2131..8f605c8c 100644 --- a/hugo/content/guides/multi-mode-lexing.md +++ b/hugo/content/guides/multi-mode-lexing.md @@ -143,7 +143,7 @@ export class CustomTokenBuilder extends DefaultTokenBuilder { ``` With this change in place, the parser will work as expected. There is one last issue which we need to resolve in order to get everything working perfectly. -When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there (mainly `` ` ``, `{` and `}`). +When inspecting our AST, the `TemplateLiteral` object will contain strings with input artifacts in there; mainly `` ` ``, `{` and `}`. These aren't actually part of the semantic value of these strings, so we should get rid of them. 
We will need to create a custom `ValueConverter` and remove these artifacts: