Implement ident decode/encode

csstree · Jan 21, 2020 · 3355b15 · 3355b15
1 parent b9840cf
commit 3355b15
Show file tree

Hide file tree

Showing 5 changed files with 258 additions and 19 deletions.
diff --git a/lib/syntax/create.js b/lib/syntax/create.js
@@ -11,6 +11,7 @@ const createWalker = require('../walker/create');
 const clone = require('../utils/clone');
 const names = require('../utils/names');
 const mix = require('./config/mix');
+const ident = require('../utils/ident');
 const string = require('../utils/string');
 const url = require('../utils/url');
 
@@ -30,6 +31,7 @@ function createSyntax(config) {
         keyword: names.keyword,
         property: names.property,
         isCustomProperty: names.isCustomProperty,
+        ident,
         string,
         url,
 

diff --git a/lib/utils/ident.js b/lib/utils/ident.js
@@ -0,0 +1,105 @@
+const { consumeEscaped, decodeEscaped } = require('../tokenizer/utils');
+const {
+    isName,
+    isValidEscape
+} = require('../tokenizer/char-code-definitions');
+
+const REVERSE_SOLIDUS = 0x005c; // U+005C REVERSE SOLIDUS (\)
+
+function decode(str) { // Turn ident source text into its unescaped value: resolve `\` escapes, copy everything else as is
+    const end = str.length - 1; // index of the last UTF-16 code unit (-1 for an empty string)
+    let decoded = '';
+
+    for (let i = 0; i < str.length; i++) {
+        let code = str.charCodeAt(i);
+
+        if (code === REVERSE_SOLIDUS) {
+            // A backslash that is the very last code unit has nothing to escape
+            if (i === end) {
+                // the next input code point is EOF: drop the backslash and stop
+                break;
+            }
+
+            code = str.charCodeAt(++i); // step onto the code unit right after the backslash
+
+            // Backslash + `code` form an escape: decode it as a unit
+            if (isValidEscape(REVERSE_SOLIDUS, code)) {
+                const escapeStart = i - 1; // index of the backslash itself
+                const escapeEnd = consumeEscaped(str, escapeStart); // NOTE(review): presumably the index just past the escape sequence — confirm in ../tokenizer/utils
+
+                i = escapeEnd - 1; // the loop's i++ then resumes at escapeEnd
+                decoded += decodeEscaped(str.substring(escapeStart + 1, escapeEnd)); // pass the escape body without its leading backslash
+            } else {
+                // Not an escape: nothing is appended, so the backslash and the code unit after it are dropped; for CR LF the LF is skipped as well
+                if (code === 0x000d && str.charCodeAt(i + 1) === 0x000a) {
+                    i++;
+                }
+            }
+        } else {
+            decoded += str[i]; // ordinary code unit: copy through unchanged
+        }
+    }
+
+    return decoded;
+}
+
+// Serialize `str` as a CSS identifier and return the escaped text, following
+// "serialize an identifier": https://drafts.csswg.org/cssom/#serialize-an-identifier (§ 2.1. Common Serializing Idioms)
+function encode(str) {
+    let encoded = '';
+
+    // If the character is the first character and is a "-" (U+002D),
+    // and there is no second character, then the escaped character.
+    // Note: That means a single dash string "-" is returned as an escaped dash,
+    // so the condition is moved out of the main loop
+    if (str.length === 1 && str.charCodeAt(0) === 0x002D) {
+        return '\\-';
+    }
+
+    // To serialize an identifier means to create a string represented
+    // by the concatenation of, for each character of the identifier:
+    for (let i = 0; i < str.length; i++) { // walks UTF-16 code units, not code points
+        let code = str.charCodeAt(i);
+
+        // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
+        if (code === 0x0000) {
+            encoded += '\uFFFD';
+            continue;
+        }
+
+        if (
+            // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F ...
+            // Note: No need to compare with 0x0001 since 0x0000 is already handled above
+            code <= 0x001F || code === 0x007F ||
+            // [or] ... is in the range [0-9] (U+0030 to U+0039),
+            (code >= 0x0030 && code <= 0x0039 && (
+                // If the character is the first character ...
+                i === 0 ||
+                // If the character is the second character ... and the first character is a "-" (U+002D)
+                i === 1 && str.charCodeAt(0) === 0x002D
+            ))
+        ) {
+            // ... then the character escaped as code point.
+            encoded += '\\' + code.toString(16) + ' '; // backslash, lowercase hex code, then one trailing space
+            continue;
+        }
+
+        // If the character is not handled by one of the above rules and is greater
+        // than or equal to U+0080, is "-" (U+002D) or "_" (U+005F), or is in one
+        // of the ranges [0-9] (U+0030 to U+0039), [A-Z] (U+0041 to U+005A),
+        // or [a-z] (U+0061 to U+007A), then the character itself.
+        if (isName(code)) { // NOTE(review): assumes isName() accepts exactly the set described above — confirm in ../tokenizer/char-code-definitions
+            encoded += str.charAt(i);
+        } else {
+            // Otherwise, the escaped character.
+            encoded += '\\' + str.charAt(i); // backslash followed by the character itself
+        }
+    }
+
+    return encoded;
+}
+
+module.exports = {
+    decode,
+    encode
+};
diff --git a/lib/utils/string.js b/lib/utils/string.js
@@ -1,11 +1,10 @@
 const { consumeEscaped, decodeEscaped } = require('../tokenizer/utils');
 const {
     isHexDigit,
-    isNewline,
     isValidEscape
 } = require('../tokenizer/char-code-definitions');
 
-const REVERSE_SOLIDUS = 0x005c; // \
+const REVERSE_SOLIDUS = 0x005c; // U+005C REVERSE SOLIDUS (\)
 const QUOTATION_MARK = 0x0022;  // "
 const APOSTROPHE = 0x0027;      // '
 
@@ -62,17 +61,31 @@ function encode(str, apostrophe) {
     for (let i = 0; i < str.length; i++) {
         let code = str.charCodeAt(i);
 
-        if (isNewline(code)) {
+        // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
+        if (code === 0x0000) {
+            encoded += '\uFFFD';
+            continue;
+        }
+
+        // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F,
+        // the character escaped as code point.
+        // Note: No need to compare with 0x0001 since 0x0000 is already processed above
+        if (code <= 0x001f || code === 0x007F) {
             encoded += '\\' + code.toString(16);
             wsBeforeHexIsNeeded = true;
-        } else if (code === REVERSE_SOLIDUS || code === quoteCode) {
+            continue;
+        }
+
+        // If the character is '"' (U+0022) or "\" (U+005C), the escaped character.
+        if (code === quoteCode || code === REVERSE_SOLIDUS) {
             encoded += '\\' + str.charAt(i);
             wsBeforeHexIsNeeded = false;
         } else {
             if (wsBeforeHexIsNeeded && isHexDigit(code)) {
                 encoded += ' ';
             }
 
+            // Otherwise, the character itself.
             encoded += str.charAt(i);
             wsBeforeHexIsNeeded = false;
         }

diff --git a/lib/utils/url.js b/lib/utils/url.js
@@ -2,12 +2,11 @@ const { consumeEscaped, decodeEscaped } = require('../tokenizer/utils');
 const {
     isHexDigit,
     isWhiteSpace,
-    isNonPrintable,
     isValidEscape
 } = require('../tokenizer/char-code-definitions');
 
 const SPACE = 0x0020;            // U+0020 SPACE
-const REVERSE_SOLIDUS = 0x005c;  // \
+const REVERSE_SOLIDUS = 0x005c;  // U+005C REVERSE SOLIDUS (\)
 const QUOTATION_MARK = 0x0022;   // "
 const APOSTROPHE = 0x0027;       // '
 const LEFTPARENTHESIS = 0x0028;  // U+0028 LEFT PARENTHESIS (()
@@ -34,7 +33,7 @@ function decode(str) {
             // special case at the ending
             if (i === end) {
                 // if the next input code point is EOF, do nothing
-                // otherwise include last quote as escaped
+                // otherwise include the last right parenthesis as an escaped character
                 if (i !== len - 1) {
                     decoded = str.substr(i + 1);
                 }
@@ -71,15 +70,27 @@ function encode(str) {
     for (let i = 0; i < str.length; i++) {
         let code = str.charCodeAt(i);
 
-        if (isNonPrintable(code) || (isWhiteSpace(code) && code !== SPACE)) {
+        // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
+        if (code === 0x0000) {
+            encoded += '\uFFFD';
+            continue;
+        }
+
+        // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F,
+        // the character escaped as code point.
+        // Note: No need to compare with 0x0001 since 0x0000 is already processed above
+        if (code <= 0x001f || code === 0x007F) {
             encoded += '\\' + code.toString(16);
             wsBeforeHexIsNeeded = true;
-        } else if (code === SPACE ||
-                   code === REVERSE_SOLIDUS ||
-                   code === QUOTATION_MARK ||
-                   code === APOSTROPHE ||
-                   code === LEFTPARENTHESIS ||
-                   code === RIGHTPARENTHESIS) {
+            continue;
+        }
+
+        if (code === SPACE ||
+            code === REVERSE_SOLIDUS ||
+            code === QUOTATION_MARK ||
+            code === APOSTROPHE ||
+            code === LEFTPARENTHESIS ||
+            code === RIGHTPARENTHESIS) {
             encoded += '\\' + str.charAt(i);
             wsBeforeHexIsNeeded = false;
         } else {

diff --git a/test/decode-encode.js b/test/decode-encode.js
@@ -1,5 +1,5 @@
 const assert = require('assert');
-const { string, url } = require('../lib');
+const { ident, string, url } = require('../lib');
 
 function forEachTest(tests, func) {
     Object.keys(tests).forEach((from, idx) => {
@@ -51,7 +51,9 @@ describe('decode/encode', () => {
                 // (30)
                 '"\\a\\d\\c\\9"': '\n\r\f\t',
                 '"\\(\\)\\\\"': '()\\',
-                '"\\\r\\\n\\\r\n"': ''
+                '"\\\r\\\n\\\r\n"': '',
+                '"\\"': '"',
+                '"\\': ''
             };
 
             forEachTest(tests, string.decode);
@@ -67,8 +69,8 @@ describe('decode/encode', () => {
                 // (5)
                 'a\rb': '"a\\d b"',
                 'a\fb': '"a\\c b"',
-                'a\tb': '"a\tb"',
-                'a\nbc\n"b\tx': '"a\\a bc\\a\\"b\tx"',
+                'a\tb': '"a\\9 b"',
+                'a\nbc\n"b\tx': '"a\\a bc\\a\\"b\\9x"',
                 'a\\26b': '"a\\\\26b"',
                 // (10)
                 'a&b': '"a&b"',
@@ -105,7 +107,10 @@ describe('decode/encode', () => {
                 'url(\\abcdefa)': '\ufffda',  // is greater than the maximum allowed code point
                 'url(\\def0)': '\ufffd',      // is for a surrogate
                 'url(\\00abcdef)': '\uabcdef',
-                'url(\\abcdef1)': '\ufffd1'
+                // (20)
+                'url(\\abcdef1)': '\ufffd1',
+                'url(\\)': ')',
+                'url(\\': ''
             };
 
             forEachTest(tests, url.decode);
@@ -133,4 +138,107 @@ describe('decode/encode', () => {
             forEachTest(tests, url.encode);
         });
     });
+
+    describe('ident', () => { // fixture tables for ident.decode() and ident.encode()
+        describe('decode', () => {
+            const tests = { // NOTE(review): presumably forEachTest checks func(key) against the value — its body is not shown here
+                '': '',
+                'foo': 'foo',
+                'a\\\r\\\n\\\r\nb': 'ab', // backslash + CR, LF or CR LF disappears entirely
+                '\\21': '!',
+                '\\021': '!',
+                // (5)
+                '\\0021': '!',
+                '\\00021': '!',
+                '\\000021': '!',
+                '\\0000211': '!1', // at most six hex digits belong to the escape; the seventh is literal
+                '\\000021 1': '!1', // one whitespace after the hex digits is consumed with the escape
+                // (10)
+                '\\000021\t1': '!1',
+                '\\0': '\ufffd',
+                '\\0x': '\ufffdx',
+                '\\abcdefa': '\ufffda',  // is greater than the maximum allowed code point
+                '\\def0': '\ufffd',      // is for a surrogate
+                // (15)
+                '\\00abcdef': '\uabcdef', // expected value is U+ABCD followed by literal "ef"
+                '\\abcdef1': '\ufffd1',
+                '\\': '' // lone trailing backslash yields nothing
+            };
+
+            forEachTest(tests, ident.decode);
+        });
+
+        describe('encode', () => {
+            // Tests adapted from: https://github.com/mathiasbynens/CSS.escape/blob/master/tests/tests.js
+            const tests = {
+                '': '',
+                '\0': '\uFFFD', // NULL is replaced, never escaped
+                'a\0': 'a\uFFFD',
+                '\0b': '\uFFFDb',
+                'a\0b': 'a\uFFFDb',
+
+                '\uFFFD': '\uFFFD',
+                'a\uFFFD': 'a\uFFFD',
+                '\uFFFDb': '\uFFFDb',
+                'a\uFFFDb': 'a\uFFFDb',
+
+                '\x01\x02\x1E\x1F': '\\1 \\2 \\1e \\1f ', // control characters become hex escapes, each with a trailing space
+
+                '0a': '\\30 a', // a leading digit is hex-escaped
+                '1a': '\\31 a',
+                '2a': '\\32 a',
+                '3a': '\\33 a',
+                '4a': '\\34 a',
+                '5a': '\\35 a',
+                '6a': '\\36 a',
+                '7a': '\\37 a',
+                '8a': '\\38 a',
+                '9a': '\\39 a',
+
+                'a0b': 'a0b', // digits after the first position stay as is
+                'a1b': 'a1b',
+                'a2b': 'a2b',
+                'a3b': 'a3b',
+                'a4b': 'a4b',
+                'a5b': 'a5b',
+                'a6b': 'a6b',
+                'a7b': 'a7b',
+                'a8b': 'a8b',
+                'a9b': 'a9b',
+
+                '-0a': '-\\30 a', // a digit right after a leading "-" is hex-escaped
+                '-1a': '-\\31 a',
+                '-2a': '-\\32 a',
+                '-3a': '-\\33 a',
+                '-4a': '-\\34 a',
+                '-5a': '-\\35 a',
+                '-6a': '-\\36 a',
+                '-7a': '-\\37 a',
+                '-8a': '-\\38 a',
+                '-9a': '-\\39 a',
+
+                '-': '\\-', // a lone "-" is the only dash in these fixtures that gets escaped
+                '-a': '-a',
+                '--': '--',
+                '--a': '--a',
+
+                '\x80\x2D\x5F\xA9': '\x80\x2D\x5F\xA9',
+                '\x7F\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F': '\\7f \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F', // U+007F is hex-escaped; U+0080 and above pass through
+                '\xA0\xA1\xA2': '\xA0\xA1\xA2',
+                'a0123456789b': 'a0123456789b',
+                'abcdefghijklmnopqrstuvwxyz': 'abcdefghijklmnopqrstuvwxyz',
+                'ABCDEFGHIJKLMNOPQRSTUVWXYZ': 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
+
+                '\x20\x21\x78\x79': '\\ \\!xy', // space and "!" get a plain backslash escape
+
+                // astral symbol (U+1D306 TETRAGRAM FOR CENTRE)
+                '\uD834\uDF06': '\uD834\uDF06',
+                // lone surrogates
+                '\uDF06': '\uDF06',
+                '\uD834': '\uD834'
+            };
+
+            forEachTest(tests, ident.encode);
+        });
+    });
 });