Skip to content
Permalink
Browse files

Breaking: Supports Unicode BOM (fixes #4878)

- The `eslint.verify()` API now strips the BOM before parsing.
- The `SourceCode` constructor now strips the BOM from its `text` argument.
- A `hasBOM` property was added to the `SourceCode` object.
- `SourceCodeFixer.applyFixes()` now inserts a BOM at the head of the output if `sourceCode.hasBOM` is `true`.
  • Loading branch information...
mysticatea committed Jan 13, 2016
1 parent 1d517fa commit 12fe80369f767722fe6a7886a68f39a7cc18e456
@@ -16,6 +16,18 @@ var code = new SourceCode("var foo = bar;", ast);

The `SourceCode` constructor throws an error if the AST is missing any of the required information.

The `SourceCode` constructor strips a leading Unicode BOM from the text.
Note that the AST must also be parsed from the stripped text.

```js
var SourceCode = require("eslint").SourceCode;
var code = new SourceCode("\uFEFFvar foo = bar;", ast);
assert(code.hasBOM === true);
assert(code.text === "var foo = bar;");
```

### splitLines()

This is a static function on `SourceCode` that is used to split the source code text into an array of lines.
@@ -84,7 +96,6 @@ The `verify()` method returns an array of objects containing information about t
```js
{
fatal: false,
severity: 2,
ruleId: "semi",
severity: 2,
line: 1,
@@ -243,7 +243,8 @@ Once you have an instance of `SourceCode`, you can use the methods on it to work

There are also some properties you can access:

* `text` - the full text of the code being linted.
* `hasBOM` - a flag indicating whether or not the source code has a Unicode BOM.
* `text` - the full text of the code being linted. The Unicode BOM, if present, has been stripped from this text.
* `ast` - the `Program` node of the AST for the code being linted.
* `lines` - an array of lines, split according to the specification's definition of line breaks.

@@ -264,3 +264,27 @@ ESLint 2.0.0 removes these conflicting defaults, and so you may begin seeing lin

[`no-multiple-empty-lines`]: ../rules/no-multiple-empty-lines
[`func-style`]: ../rules/func-style


## SourceCode constructor (Node API) changes

The `SourceCode` constructor now handles the Unicode BOM.
If the first argument `text` starts with a BOM, the `SourceCode` constructor sets `this.hasBOM` to `true` and strips the BOM from the text.

```js
var SourceCode = require("eslint").SourceCode;
var code = new SourceCode("\uFEFFvar foo = bar;", ast);
assert(code.hasBOM === true);
assert(code.text === "var foo = bar;");
```

Therefore, the second argument `ast` must also be parsed from the stripped text.

**To address:** If you are using the `SourceCode` constructor in your code, please parse the source code after stripping the BOM from it:

```js
var ast = yourParser.parse(text.replace(/^\uFEFF/, ""), options);
var sourceCode = new SourceCode(text, ast);
```
@@ -482,6 +482,22 @@ function findEslintEnv(text) {
return retv;
}

/**
 * Removes a leading Unicode byte order mark (BOM) from a string, if present.
 *
 * @param {string} text - The text to inspect.
 * @returns {string} `text` without its first character when that character
 *      is U+FEFF; otherwise `text` unchanged.
 */
function stripUnicodeBOM(text) {
    // JavaScript strings are sequences of UTF-16 code units, so a BOM
    // appears as the single code unit 0xFEFF at index 0.
    // http://www.ecma-international.org/ecma-262/6.0/#sec-unicode-format-control-characters
    var startsWithBOM = (text.charCodeAt(0) === 0xFEFF);

    return startsWithBOM ? text.slice(1) : text;
}

//------------------------------------------------------------------------------
// Public Interface
//------------------------------------------------------------------------------
@@ -673,17 +689,19 @@ module.exports = (function() {

// only do this for text
if (text !== null) {

// there's no input, just exit here
if (text.trim().length === 0) {
sourceCode = new SourceCode(text, blankScriptAST);
return messages;
}

ast = parse(text.replace(/^#!([^\r\n]+)/, function(match, captured) {
shebang = captured;
return "//" + captured;
}), config);
ast = parse(
stripUnicodeBOM(text).replace(/^#!([^\r\n]+)/, function(match, captured) {
shebang = captured;
return "//" + captured;
}),
config
);

if (ast) {
sourceCode = new SourceCode(text, ast);
@@ -16,6 +16,8 @@ var debug = require("debug")("eslint:text-fixer");
// Helpers
//------------------------------------------------------------------------------

var BOM = "\uFEFF";

/**
* Compares items in a messages array by line and column.
* @param {Message} a The first message.
@@ -69,7 +71,8 @@ SourceCodeFixer.applyFixes = function(sourceCode, messages) {
var remainingMessages = [],
fixes = [],
text = sourceCode.text,
lastFixPos = text.length + 1;
lastFixPos = text.length + 1,
prefix = (sourceCode.hasBOM ? BOM : "");

messages.forEach(function(problem) {
if (problem.hasOwnProperty("fix")) {
@@ -96,10 +99,24 @@ SourceCodeFixer.applyFixes = function(sourceCode, messages) {

fixes.forEach(function(problem) {
var fix = problem.fix;

if (fix.range[1] < lastFixPos) {
chars.splice(fix.range[0], fix.range[1] - fix.range[0], fix.text);
lastFixPos = fix.range[0];
var start = fix.range[0];
var end = fix.range[1];
var insertionText = fix.text;

if (end < lastFixPos) {
if (start < 0) {
// Remove BOM.
prefix = "";
start = 0;
}
if (start === 0 && insertionText[0] === BOM) {
// Set BOM.
prefix = BOM;
insertionText = insertionText.slice(1);
}

chars.splice(start, end - start, insertionText);
lastFixPos = start;
} else {
remainingMessages.push(problem);
}
@@ -108,14 +125,14 @@ SourceCodeFixer.applyFixes = function(sourceCode, messages) {
return {
fixed: true,
messages: remainingMessages.sort(compareMessagesByLocation),
output: chars.join("")
output: prefix + chars.join("")
};
} else {
debug("No fixes to apply");
return {
fixed: false,
messages: messages,
output: text
output: prefix + text
};
}
};
@@ -87,19 +87,25 @@ function looksLikeExport(astNode) {

/**
* Represents parsed source code.
* @param {string} text The source code text.
* @param {ASTNode} ast The Program node of the AST representing the code.
* @param {string} text - The source code text.
* @param {ASTNode} ast - The Program node of the AST representing the code. This AST should be created from the text that BOM was stripped.
* @constructor
*/
function SourceCode(text, ast) {

validate(ast);

/**
* The flag to indicate that the source code has Unicode BOM.
* @type boolean
*/
this.hasBOM = (text.charCodeAt(0) === 0xFEFF);

/**
* The original text source code.
* BOM was stripped from this text.
* @type string
*/
this.text = text;
this.text = (this.hasBOM ? text.slice(1) : text);

/**
* The parsed AST for the source code.
@@ -112,7 +118,7 @@ function SourceCode(text, ast) {
* This is done to avoid each rule needing to do so separately.
* @type string[]
*/
this.lines = SourceCode.splitLines(text);
this.lines = SourceCode.splitLines(this.text);

this.tokensAndComments = ast.tokens.concat(ast.comments).sort(function(left, right) {
return left.range[0] - right.range[0];
@@ -0,0 +1,3 @@
"use strict";

console.log("This file has [0xEF, 0xBB, 0xBF] as BOM.");
@@ -71,7 +71,10 @@ ruleTester.run("no-irregular-whitespace", rule, {
"'\\\u2029';", // multiline string
"'\u202F';",
"'\u205f';",
"'\u3000';"
"'\u3000';",

// Unicode BOM.
"\uFEFFconsole.log('hello BOM');"
],

invalid: [
Oops, something went wrong.

0 comments on commit 12fe803

Please sign in to comment.
You can’t perform that action at this time.