add support for source phase imports

guybedford · Mar 25, 2024 · 96347a4 · 96347a4
1 parent f44438c
commit 96347a4
Show file tree

Hide file tree

Showing 7 changed files with 234 additions and 115 deletions.
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ A JS module syntax lexer used in [es-module-shims](https://github.com/guybedford
 
 Outputs the list of exports and locations of import specifiers, including dynamic import and import meta handling.
 
+Supports new syntax features including import attributes and source phase imports.
+
 A very small single JS file (4KiB gzipped) that includes inlined Web Assembly for very fast source analysis of ECMAScript module syntax only.
 
 For an example of the performance, Angular 1 (720KiB) is fully parsed in 5ms, in comparison to the fastest JS parser, Acorn which takes over 100ms.
@@ -20,6 +22,8 @@ _Comprehensively handles the JS language grammar while remaining small and fast.
 npm install es-module-lexer
 ```
 
+See [types/lexer.d.ts](types/lexer.d.ts) for the type definitions.
+
 For use in CommonJS:
 
 ```js
@@ -60,6 +64,10 @@ import { init, parse } from 'es-module-lexer';
     // Comments provided to demonstrate edge cases
     import /*comment!*/ (  'asdf', { assert: { type: 'json' }});
     import /*comment!*/.meta.asdf;
+
+    // Source phase imports:
+    import source mod from './mod.wasm';
+    import.source('./mod.wasm');
   `;
 
   const [imports, exports] = parse(source, 'optional-sourcename');
@@ -98,10 +106,10 @@ import { init, parse } from 'es-module-lexer';
   // Returns -1
   exports[2].le;
 
-  // Dynamic imports are indicated by imports[2].d > -1
-  // In this case the "d" index is the start of the dynamic import bracket
+  // Import type is provided by `t` value
+  // (1 for static, 2 for dynamic)
   // Returns true
-  imports[2].d > -1;
+  imports[2].t == 2;
 
   // Returns "asdf" (only for string literal dynamic imports)
   imports[2].n
@@ -128,6 +136,13 @@ import { init, parse } from 'es-module-lexer';
   // Returns "import /*comment!*/.meta"
   source.slice(imports[4].s, imports[4].e);
   // ss and se are the same for import meta
+
+  // Returns "./mod.wasm" (static import specifiers are sliced without their quotes)
+  source.slice(imports[5].s, imports[5].e);
+
+  // Import types 4 and 5 indicate static and dynamic source phase imports, respectively
+  imports[5].t === 4;
+  imports[6].t === 5;
 })();
 ```
 

diff --git a/chompfile.toml b/chompfile.toml
@@ -52,7 +52,7 @@ dep = 'src/lexer.ts'
 # even when we set "source-maps = false", so for now we have ejected the
 # template to its raw "run" command, and added an "rm" step.
 run = '''
-node ./node_modules/@swc/cli/bin/swc.js $DEP -o $TARGET --no-swcrc -C jsc.parser.syntax=typescript -C jsc.parser.importAssertions=true -C jsc.parser.topLevelAwait=true -C jsc.parser.importMeta=true -C jsc.parser.privateMethod=true -C jsc.parser.dynamicImport=true -C jsc.target=es2016 -C jsc.experimental.keepImportAttributes=true
+node ./node_modules/@swc/cli/bin/swc.js $DEP -o $TARGET --no-swcrc -C jsc.parser.syntax=typescript -C jsc.parser.importAssertions=true -C jsc.parser.topLevelAwait=true -C jsc.parser.importMeta=true -C jsc.parser.privateMethod=true -C jsc.parser.dynamicImport=true -C jsc.target=es2016 -C jsc.experimental.keepImportAssertions=true
 '''
 
 [[task]]
@@ -96,7 +96,7 @@ deps = ['src/lexer.h', 'src/lexer.c']
 run = """
 	${{ WASI_PATH }}/bin/clang src/lexer.c --sysroot=${{ WASI_PATH }}/share/wasi-sysroot -o lib/lexer.wasm -nostartfiles \
 	"-Wl,-z,stack-size=13312,--no-entry,--compress-relocations,--strip-all,\
-	--export=parse,--export=sa,--export=e,--export=ri,--export=re,--export=is,--export=ie,--export=ss,--export=ip,--export=se,--export=ai,--export=id,--export=es,--export=ee,--export=els,--export=ele,--export=f,--export=ms,--export=__heap_base" \
+	--export=parse,--export=sa,--export=e,--export=ri,--export=re,--export=is,--export=ie,--export=it,--export=ss,--export=ip,--export=se,--export=ai,--export=id,--export=es,--export=ee,--export=els,--export=ele,--export=f,--export=ms,--export=__heap_base" \
 	-Wno-logical-op-parentheses -Wno-parentheses \
 	-Oz
 """

diff --git a/lib/lexer.wasm b/lib/lexer.wasm
diff --git a/src/lexer.c b/src/lexer.c
@@ -28,6 +28,7 @@ static const char16_t BREA[] = { 'b', 'r', 'e', 'a' };
 static const char16_t CONTIN[] = { 'c', 'o', 'n', 't', 'i', 'n' };
 static const char16_t SYNC[] = {'s', 'y', 'n', 'c'};
 static const char16_t UNCTION[] = {'u', 'n', 'c', 't', 'i', 'o', 'n'};
+static const char16_t OURCE[] = {'o', 'u', 'r', 'c', 'e'};
 
 // Note: parsing is based on the _assumption_ that the source is already valid
 bool parse () {
@@ -239,124 +240,136 @@ void tryParseImportStatement () {
 
   char16_t ch = commentWhitespace(true);
 
-  switch (ch) {
-    // dynamic import
-    case '(':
-      openTokenStack[openTokenDepth].token = ImportParen;
-      openTokenStack[openTokenDepth++].pos = pos;
-      if (*lastTokenPos == '.')
-        return;
-      // dynamic import indicated by positive d
-      char16_t* dynamicPos = pos;
-      // try parse a string, to record a safe dynamic import string
-      pos++;
-      ch = commentWhitespace(true);
-      addImport(startPos, pos, 0, dynamicPos);
-      dynamicImportStack[dynamicImportStackDepth++] = import_write_head;
-      if (ch == '\'') {
-        stringLiteral(ch);
-      }
-      else if (ch == '"') {
-        stringLiteral(ch);
-      }
-      else {
-        pos--;
-        return;
-      }
-      pos++;
-      char16_t* endPos = pos;
+  bool source_keyword = false;
+
+  if (ch == '.') {
+    // import.meta
+    pos++;
+    ch = commentWhitespace(true);
+    // import.meta indicated by d == -2
+    if (ch == 'm' && memcmp(pos + 1, &ETA[0], 3 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.')) {
+      addImport(startPos, startPos, pos + 4, IMPORT_META);
+      return;
+    }
+    else if (ch == 's' && memcmp(pos + 1, &OURCE[0], 5 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.')) {
+      source_keyword = true;
+      pos += 6;
       ch = commentWhitespace(true);
-      if (ch == ',') {
-        pos++;
-        ch = commentWhitespace(true);
-        import_write_head->end = endPos;
-        import_write_head->assert_index = pos;
-        import_write_head->safe = true;
-        pos--;
-      }
-      else if (ch == ')') {
-        openTokenDepth--;
-        import_write_head->end = endPos;
-        import_write_head->statement_end = pos + 1;
-        import_write_head->safe = true;
-        dynamicImportStackDepth--;
-      }
-      else {
-        pos--;
-      }
+    }
+    else {
       return;
-    // import.meta
-    case '.':
+    }
+  }
+  else if (pos > startPos + 6 && ch == 's' && memcmp(pos + 1, &OURCE[0], 5 * 2) == 0 && isBrOrWs(*(pos + 6))) {
+    source_keyword = true;
+    pos += 6;
+    ch = commentWhitespace(true);
+  }
+
+  // dynamic import
+  if (ch == '(') {
+    openTokenStack[openTokenDepth].token = ImportParen;
+    openTokenStack[openTokenDepth++].pos = pos;
+    if (*lastTokenPos == '.')
+      return;
+    // dynamic import indicated by positive d
+    char16_t* dynamicPos = pos;
+    // try parse a string, to record a safe dynamic import string
+    pos++;
+    ch = commentWhitespace(true);
+    addImport(startPos, pos, 0, dynamicPos);
+    if (source_keyword)
+      import_write_head->import_ty = DynamicSourcePhase;
+    dynamicImportStack[dynamicImportStackDepth++] = import_write_head;
+    if (ch == '\'') {
+      stringLiteral(ch);
+    }
+    else if (ch == '"') {
+      stringLiteral(ch);
+    }
+    else {
+      pos--;
+      return;
+    }
+    pos++;
+    char16_t* endPos = pos;
+    ch = commentWhitespace(true);
+    if (ch == ',') {
       pos++;
       ch = commentWhitespace(true);
-      // import.meta indicated by d == -2
-      if (ch == 'm' && memcmp(pos + 1, &ETA[0], 3 * 2) == 0 && (isSpread(lastTokenPos) || *lastTokenPos != '.'))
-        addImport(startPos, startPos, pos + 4, IMPORT_META);
+      import_write_head->end = endPos;
+      import_write_head->assert_index = pos;
+      import_write_head->safe = true;
+      pos--;
+    }
+    else if (ch == ')') {
+      openTokenDepth--;
+      import_write_head->end = endPos;
+      import_write_head->statement_end = pos + 1;
+      import_write_head->safe = true;
+      dynamicImportStackDepth--;
+    }
+    else {
+      pos--;
+    }
+    return;
+  }
+
+  if (ch == '{' && !source_keyword) {
+    // import statement only permitted at base-level
+    if (openTokenDepth != 0) {
+      pos--;
       return;
+    }
 
-    default:
-      // no space after "import" -> not an import keyword
-      if (pos == startPos + 6) {
-        pos--;
-        break;
-      }
-    case '"':
-    case '\'':
-    case '*': {
-      // import statement only permitted at base-level
-      if (openTokenDepth != 0) {
-        pos--;
-        return;
-      }
-      while (pos < end) {
-        ch = *pos;
-        if (isQuote(ch)) {
-          readImportString(startPos, ch);
-          return;
-        }
+    while (pos < end) {
+      ch = commentWhitespace(true);
+      if (isQuote(ch)) {
+        stringLiteral(ch);
+      } else if (ch == '}') {
         pos++;
+        break;
       }
-      syntaxError();
-      break;
+      pos++;
     }
 
-    case '{': {
-      // import statement only permitted at base-level
-      if (openTokenDepth != 0) {
-        pos--;
-        return;
-      }
-
-      while (pos < end) {
-        ch = commentWhitespace(true);
+    ch = commentWhitespace(true);
+    if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) != 0) {
+      syntaxError();
+      return;
+    }
 
-        if (isQuote(ch)) {
-          stringLiteral(ch);
-        } else if (ch == '}') {
-          pos++;
-          break;
-        }
+    pos += 4;
+    ch = commentWhitespace(true);
 
-        pos++;
-      }
+    if (!isQuote(ch)) {
+      return syntaxError();
+    }
 
-      ch = commentWhitespace(true);
-      if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) != 0) {
-        syntaxError();
-        break;
+    readImportString(startPos, ch, false);
+  }
+  else {
+    if (source_keyword || !(ch == '"' || ch == '\'' || ch == '*')) {
+      // no space after "import" -> not an import keyword
+      if (pos == startPos + (source_keyword ? 12 : 6)) {
+        pos--;
+        return;
       }
-
-      pos += 4;
-      ch = commentWhitespace(true);
-
-      if (!isQuote(ch)) {
-        return syntaxError();
+    }
+    // import statement only permitted at base-level
+    if (openTokenDepth != 0 ) {
+      pos--;
+      return;
+    }
+    while (pos < end) {
+      ch = *pos;
+      if (isQuote(ch)) {
+        readImportString(startPos, ch, source_keyword);
+        return;
       }
-
-      readImportString(startPos, ch);
-
-      break;
+      pos++;
     }
+    syntaxError();
   }
 }
 
@@ -572,7 +585,7 @@ void tryParseExportStatement () {
   // from ...
   if (ch == 'f' && memcmp(pos + 1, &ROM[0], 3 * 2) == 0) {
     pos += 4;
-    readImportString(sStartPos, commentWhitespace(true));
+    readImportString(sStartPos, commentWhitespace(true), false);
 
     // There were no local names.
     for (Export* exprt = prev_export_write_head == NULL ? first_export : prev_export_write_head->next; exprt != NULL; exprt = exprt->next) {
@@ -619,7 +632,7 @@ char16_t readExportAs (char16_t* startPos, char16_t* endPos) {
   return ch;
 }
 
-void readImportString (const char16_t* ss, char16_t ch) {
+void readImportString (const char16_t* ss, char16_t ch, bool source_phase) {
   const char16_t* startPos = pos + 1;
   if (ch == '\'') {
     stringLiteral(ch);
@@ -632,6 +645,9 @@ void readImportString (const char16_t* ss, char16_t ch) {
     return;
   }
   addImport(ss, startPos, pos, STANDARD_IMPORT);
+  if (source_phase) {
+    import_write_head->import_ty = StaticSourcePhase;
+  }
   pos++;
   ch = commentWhitespace(false);
   if (!(ch == 'a' && memcmp(pos + 1, &SSERT[0], 5 * 2) == 0) && !(ch == 'w' && *(pos + 1) == 'i' && *(pos + 2) == 't' && *(pos + 3) == 'h')) {