diff --git a/examples/arithmetics/syntaxes/arithmetics.tmLanguage.json b/examples/arithmetics/syntaxes/arithmetics.tmLanguage.json index c806b6ff9..b0f53490f 100644 --- a/examples/arithmetics/syntaxes/arithmetics.tmLanguage.json +++ b/examples/arithmetics/syntaxes/arithmetics.tmLanguage.json @@ -10,7 +10,7 @@ }, { "name": "keyword.control.arithmetics", - "match": "\\b([dD][eE][fF]|[mM][oO][dD][uU][lL][eE])\\b" + "match": "(?i)\\b(def|module)\\b" } ], "repository": { diff --git a/packages/langium-cli/src/generator/highlighting/textmate-generator.ts b/packages/langium-cli/src/generator/highlighting/textmate-generator.ts index e88d42056..03f2a6f04 100644 --- a/packages/langium-cli/src/generator/highlighting/textmate-generator.ts +++ b/packages/langium-cli/src/generator/highlighting/textmate-generator.ts @@ -128,14 +128,14 @@ function getRepository(grammar: Grammar, config: LangiumLanguageConfig): Reposit function getControlKeywords(grammar: Grammar, pack: LangiumLanguageConfig): Pattern { const regex = /[A-Za-z]/; const controlKeywords = collectKeywords(grammar).filter(kw => regex.test(kw)); - const groups = groupKeywords(controlKeywords, pack.caseInsensitive); + const groups = groupKeywords(controlKeywords); return { 'name': `keyword.control.${pack.id}`, - 'match': groups.join('|') + 'match': `${pack.caseInsensitive ? '(?i)' : ''}${groups.join('|')}` }; } -function groupKeywords(keywords: string[], caseInsensitive: boolean | undefined): string[] { +function groupKeywords(keywords: string[]): string[] { const groups: { letter: string[], leftSpecial: string[], @@ -144,7 +144,7 @@ function groupKeywords(keywords: string[], caseInsensitive: boolean | undefined) } = { letter: [], leftSpecial: [], rightSpecial: [], special: [] }; keywords.forEach(keyword => { - const keywordPattern = caseInsensitive ? 
RegExpUtils.getCaseInsensitivePattern(keyword) : RegExpUtils.escapeRegExp(keyword); + const keywordPattern = RegExpUtils.escapeRegExp(keyword); if (/\w/.test(keyword[0])) { if (/\w/.test(keyword[keyword.length - 1])) { groups.letter.push(keywordPattern); diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts index 759e17fce..442a9757f 100644 --- a/packages/langium/src/parser/token-builder.ts +++ b/packages/langium/src/parser/token-builder.ts @@ -11,7 +11,7 @@ import { Lexer } from 'chevrotain'; import { isKeyword, isParserRule, isTerminalRule } from '../languages/generated/ast.js'; import { streamAllContents } from '../utils/ast-utils.js'; import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js'; -import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js'; +import { escapeRegExp, isWhitespace, partialMatches } from '../utils/regexp-utils.js'; import { stream } from '../utils/stream.js'; export interface TokenBuilderOptions { @@ -53,14 +53,10 @@ export class DefaultTokenBuilder implements TokenBuilder { const terminalTokens: TokenType[] = this.buildTerminalTokens(reachableRules); const tokens: TokenType[] = this.buildKeywordTokens(reachableRules, terminalTokens, options); - terminalTokens.forEach(terminalToken => { - const pattern = terminalToken.PATTERN; - if (typeof pattern === 'object' && pattern && 'test' in pattern && isWhitespace(pattern)) { - tokens.unshift(terminalToken); - } else { - tokens.push(terminalToken); - } - }); + // Add all terminal tokens to the end in the order they were defined + // Chevrotain documentation recommends adding Whitespace-like tokens at the start + // However, assuming the lexer is able to optimize the tokens, it should not matter + tokens.push(...terminalTokens); // We don't need to add the EOF token explicitly. // It is automatically available at the end of the token stream. 
return tokens; @@ -148,7 +144,7 @@ export class DefaultTokenBuilder implements TokenBuilder { protected buildKeywordPattern(keyword: Keyword, caseInsensitive: boolean): TokenPattern { return caseInsensitive ? - new RegExp(getCaseInsensitivePattern(keyword.value)) : + new RegExp(escapeRegExp(keyword.value), 'i') : keyword.value; } diff --git a/packages/langium/src/utils/regexp-utils.ts b/packages/langium/src/utils/regexp-utils.ts index 16e097d86..86781d726 100644 --- a/packages/langium/src/utils/regexp-utils.ts +++ b/packages/langium/src/utils/regexp-utils.ts @@ -155,12 +155,6 @@ export function escapeRegExp(value: string): string { return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } -export function getCaseInsensitivePattern(keyword: string): string { - return Array.prototype.map.call(keyword, letter => - /\w/.test(letter) ? `[${letter.toLowerCase()}${letter.toUpperCase()}]` : escapeRegExp(letter) - ).join(''); -} - /** * Determines whether the given input has a partial match with the specified regex. 
* @param regex The regex to partially match against diff --git a/packages/langium/test/parser/token-builder.test.ts b/packages/langium/test/parser/token-builder.test.ts index 426a6ee7f..733edbe3f 100644 --- a/packages/langium/test/parser/token-builder.test.ts +++ b/packages/langium/test/parser/token-builder.test.ts @@ -140,23 +140,23 @@ describe('tokenBuilder#caseInsensitivePattern', () => { }); test('should create from keyword with special symbols', () => { - expect(implementPattern).toEqual(/@[iI][mM][pP][lL][eE][mM][eE][nN][tT]/); + expect(implementPattern).toEqual(/@implement/i); }); test('should create from keyword with special escape symbols', () => { - expect(strangePattern).toEqual(/\\[sS][tT][rR][aA][nN][gG][eE]\\/); + expect(strangePattern).toEqual(/\\strange\\/i); }); test('should create from mixed-case word', () => { - expect(abcPattern).toEqual(/[aA][bB][cC]/); + expect(abcPattern).toEqual(/AbC/i); }); test('should create from lower-case word', () => { - expect(abPattern).toEqual(/[aA][bB]/); + expect(abPattern).toEqual(/ab/i); }); test('should create from upper-case word', () => { - expect(aPattern).toEqual(/[aA]/); + expect(aPattern).toEqual(/A/i); }); test('should ignore terminals', () => {