Skip to content

Commit

Permalink
remove use of go's "unsafe" package in the parser
Browse files Browse the repository at this point in the history
  • Loading branch information
evanw committed Feb 10, 2022
1 parent 84a237e commit 4db57c1
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 140 deletions.
81 changes: 56 additions & 25 deletions internal/js_lexer/js_lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"unicode"
"unicode/utf8"

"github.com/evanw/esbuild/internal/ast"
"github.com/evanw/esbuild/internal/config"
"github.com/evanw/esbuild/internal/helpers"
"github.com/evanw/esbuild/internal/js_ast"
Expand Down Expand Up @@ -216,10 +217,37 @@ type json struct {
allowComments bool
}

// This represents a string that is maybe a substring of the current file's
// "source.Contents" string. The point of doing this is that if it is a
// substring (the common case), then we can represent it more efficiently.
//
// For compactness and performance, the JS AST represents identifiers as a
// symbol reference instead of as a string. However, we need to track the
// string between the first pass and the second pass because the string is only
// resolved to a symbol in the second pass. To avoid allocating extra memory
// to store the string, we instead use an index+length slice of the original JS
// source code. That index is what "Start" represents here. The length is just
// "len(String)".
//
// Set "Start" to invalid (the zero value) if "String" is not a substring of
// "source.Contents". This is the case for escaped identifiers. For example,
// the identifier "fo\u006f" would be "MaybeSubstring{String: "foo"}". It's
// critical that any code changing the "String" also set "Start" to the zero
// value, which is best done by just overwriting the whole "MaybeSubstring".
//
// The substring range used to be recovered automatically from the string but
// that relied on the Go "unsafe" package which can hypothetically break under
// certain Go compiler optimization passes, so it has been removed and replaced
// with this more error-prone approach that doesn't use "unsafe".
type MaybeSubstring struct {
	// The text itself. This may be a slice of "source.Contents" (when "Start"
	// is valid) or a freshly-allocated string (when "Start" is the zero value).
	String string

	// Byte offset of "String" within "source.Contents", or the invalid zero
	// value when "String" is not a substring of the source (e.g. identifiers
	// containing escape sequences).
	Start ast.Index32
}

type Lexer struct {
CommentsToPreserveBefore []js_ast.Comment
AllOriginalComments []js_ast.Comment
Identifier string
Identifier MaybeSubstring
log logger.Log
source logger.Source
JSXFactoryPragmaComment logger.Span
Expand Down Expand Up @@ -323,6 +351,10 @@ func (lexer *Lexer) Raw() string {
return lexer.source.Contents[lexer.start:lexer.end]
}

// rawIdentifier wraps the raw text of the current token in a MaybeSubstring,
// recording the token's starting byte offset so that later passes can recover
// the substring range within "source.Contents" without pointer arithmetic.
func (lexer *Lexer) rawIdentifier() MaybeSubstring {
	return MaybeSubstring{
		String: lexer.Raw(),
		Start:  ast.MakeIndex32(uint32(lexer.start)),
	}
}

func (lexer *Lexer) StringLiteral() []uint16 {
if lexer.decodedStringLiteralOrNil == nil {
// Lazily decode escape sequences if needed
Expand Down Expand Up @@ -1143,7 +1175,7 @@ func (lexer *Lexer) NextInsideJSXElement() {
lexer.step()
}

lexer.Identifier = lexer.Raw()
lexer.Identifier = lexer.rawIdentifier()
lexer.Token = TIdentifier
break
}
Expand Down Expand Up @@ -1185,7 +1217,7 @@ func (lexer *Lexer) Next() {
break hashbang
}
}
lexer.Identifier = lexer.Raw()
lexer.Identifier = lexer.rawIdentifier()
} else {
// "#foo"
lexer.step()
Expand All @@ -1202,7 +1234,7 @@ func (lexer *Lexer) Next() {
if lexer.codePoint == '\\' {
lexer.Identifier, _ = lexer.scanIdentifierWithEscapes(privateIdentifier)
} else {
lexer.Identifier = lexer.Raw()
lexer.Identifier = lexer.rawIdentifier()
}
}
lexer.Token = TPrivateIdentifier
Expand Down Expand Up @@ -1717,9 +1749,8 @@ func (lexer *Lexer) Next() {
if lexer.codePoint == '\\' {
lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier)
} else {
contents := lexer.Raw()
lexer.Identifier = contents
lexer.Token = Keywords[contents]
lexer.Identifier = lexer.rawIdentifier()
lexer.Token = Keywords[lexer.Raw()]
if lexer.Token == 0 {
lexer.Token = TIdentifier
}
Expand Down Expand Up @@ -1747,7 +1778,7 @@ func (lexer *Lexer) Next() {
lexer.Identifier, lexer.Token = lexer.scanIdentifierWithEscapes(normalIdentifier)
} else {
lexer.Token = TIdentifier
lexer.Identifier = lexer.Raw()
lexer.Identifier = lexer.rawIdentifier()
}
break
}
Expand All @@ -1769,7 +1800,7 @@ const (

// This is an edge case that doesn't really exist in the wild, so it doesn't
// need to be as fast as possible.
func (lexer *Lexer) scanIdentifierWithEscapes(kind identifierKind) (string, T) {
func (lexer *Lexer) scanIdentifierWithEscapes(kind identifierKind) (MaybeSubstring, T) {
// First pass: scan over the identifier to see how long it is
for {
// Scan a unicode escape sequence. There is at least one because that's
Expand Down Expand Up @@ -1848,9 +1879,9 @@ func (lexer *Lexer) scanIdentifierWithEscapes(kind identifierKind) (string, T) {
// foo.\u0076\u0061\u0072;
//
if Keywords[text] != 0 {
return text, TEscapedKeyword
return MaybeSubstring{String: text}, TEscapedKeyword
} else {
return text, TIdentifier
return MaybeSubstring{String: text}, TIdentifier
}
}

Expand Down Expand Up @@ -1978,7 +2009,7 @@ func (lexer *Lexer) parseNumericLiteralOrDot() {

// Slow path: do we need to re-scan the input as text?
if isBigIntegerLiteral || isInvalidLegacyOctalLiteral {
text := lexer.Raw()
text := lexer.rawIdentifier()

// Can't use a leading zero for bigint literals
if isBigIntegerLiteral && lexer.IsLegacyOctalLiteral {
Expand All @@ -1987,22 +2018,22 @@ func (lexer *Lexer) parseNumericLiteralOrDot() {

// Filter out underscores
if underscoreCount > 0 {
bytes := make([]byte, 0, len(text)-underscoreCount)
for i := 0; i < len(text); i++ {
c := text[i]
bytes := make([]byte, 0, len(text.String)-underscoreCount)
for i := 0; i < len(text.String); i++ {
c := text.String[i]
if c != '_' {
bytes = append(bytes, c)
}
}
text = string(bytes)
text = MaybeSubstring{String: string(bytes)}
}

// Store bigints as text to avoid precision loss
if isBigIntegerLiteral {
lexer.Identifier = text
} else if isInvalidLegacyOctalLiteral {
// Legacy octal literals may turn out to be a base 10 literal after all
value, _ := strconv.ParseFloat(text, 64)
value, _ := strconv.ParseFloat(text.String, 64)
lexer.Number = value
}
}
Expand Down Expand Up @@ -2099,23 +2130,23 @@ func (lexer *Lexer) parseNumericLiteralOrDot() {
}

// Take a slice of the text to parse
text := lexer.Raw()
text := lexer.rawIdentifier()

// Filter out underscores
if underscoreCount > 0 {
bytes := make([]byte, 0, len(text)-underscoreCount)
for i := 0; i < len(text); i++ {
c := text[i]
bytes := make([]byte, 0, len(text.String)-underscoreCount)
for i := 0; i < len(text.String); i++ {
c := text.String[i]
if c != '_' {
bytes = append(bytes, c)
}
}
text = string(bytes)
text = MaybeSubstring{String: string(bytes)}
}

if lexer.codePoint == 'n' && !hasDotOrExponent {
// The only bigint literal that can start with 0 is "0n"
if len(text) > 1 && first == '0' {
if len(text.String) > 1 && first == '0' {
lexer.SyntaxError()
}

Expand All @@ -2124,13 +2155,13 @@ func (lexer *Lexer) parseNumericLiteralOrDot() {
} else if !hasDotOrExponent && lexer.end-lexer.start < 10 {
// Parse a 32-bit integer (very fast path)
var number uint32 = 0
for _, c := range text {
for _, c := range text.String {
number = number*10 + uint32(c-'0')
}
lexer.Number = float64(number)
} else {
// Parse a double-precision floating-point number
value, _ := strconv.ParseFloat(text, 64)
value, _ := strconv.ParseFloat(text.String, 64)
lexer.Number = value
}
}
Expand Down
6 changes: 3 additions & 3 deletions internal/js_lexer/js_lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func expectHashbang(t *testing.T, contents string, expected string) {
msgs := log.Done()
test.AssertEqual(t, len(msgs), 0)
test.AssertEqual(t, lexer.Token, THashbang)
test.AssertEqual(t, lexer.Identifier, expected)
test.AssertEqual(t, lexer.Identifier.String, expected)
})
}

Expand Down Expand Up @@ -113,7 +113,7 @@ func expectIdentifier(t *testing.T, contents string, expected string) {
msgs := log.Done()
test.AssertEqual(t, len(msgs), 0)
test.AssertEqual(t, lexer.Token, TIdentifier)
test.AssertEqual(t, lexer.Identifier, expected)
test.AssertEqual(t, lexer.Identifier.String, expected)
})
}

Expand Down Expand Up @@ -359,7 +359,7 @@ func expectBigInteger(t *testing.T, contents string, expected string) {
msgs := log.Done()
test.AssertEqual(t, len(msgs), 0)
test.AssertEqual(t, lexer.Token, TBigIntegerLiteral)
test.AssertEqual(t, lexer.Identifier, expected)
test.AssertEqual(t, lexer.Identifier.String, expected)
})
}

Expand Down
4 changes: 2 additions & 2 deletions internal/js_parser/global_name_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func ParseGlobalName(log logger.Log, source logger.Source) (result []string, ok
lexer := js_lexer.NewLexerGlobalName(log, source)

// Start off with an identifier
result = append(result, lexer.Identifier)
result = append(result, lexer.Identifier.String)
lexer.Expect(js_lexer.TIdentifier)

// Follow with dot or index expressions
Expand All @@ -31,7 +31,7 @@ func ParseGlobalName(log logger.Log, source logger.Source) (result []string, ok
if !lexer.IsIdentifierOrKeyword() {
lexer.Expect(js_lexer.TIdentifier)
}
result = append(result, lexer.Identifier)
result = append(result, lexer.Identifier.String)
lexer.Next()

case js_lexer.TOpenBracket:
Expand Down
Loading

0 comments on commit 4db57c1

Please sign in to comment.