// core/lexer/lexer.ts

import dbg from "../../debug/debug.js";
import type { Token } from "../../ast/mod.ts";

// Debug logger used throughout the lexer, created with the label "lex".
// NOTE(review): how the label is interpreted (namespace, filter, ...) depends
// on ../../debug/debug.js, which is not visible here — confirm.
const debug = dbg("lex");

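/**
 * Convert a CSS source string into a flat array of lexical tokens.
 *
 * The input is scanned one character at a time by a stack-based state
 * machine. Tokens carry the line/column positions that were current when
 * they were started and finished.
 *
 * @param css CSS source text to tokenize.
 * @returns The tokens, in the order they were emitted.
 */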
export function lex(css: string): Token[] {
  let start = 0; // Debug timer start; set to Date.now() right before the main loop.

  let buffer = ""; // Character accumulator for the token being built; cleared by addToken()
  let ch: string; // Current character
  let column = 0; // Current source column number; skip() makes it 1 on the first character
  let cursor = -1; // Current source cursor position; -1 so the first getCh() lands on index 0
  let depth = 0; // Current nesting depth; raised on a block-opening `{`, lowered on `}`
  let line = 1; // Current source line number
  let state = "before-selector"; // Current state
  const stack = [state]; // State stack; the push/pop/replace helpers keep `state` equal to its last entry
  let token: Token = {}; // Current token
  const tokens: Token[] = []; // Token accumulator

  // Supported @-rules, in roughly descending order of usage probability.
  //
  // An entry is either a bare rule name or an object with these fields:
  //   name   - the text expected immediately after the `@`
  //   type   - token type to report instead of `name` (vendor-prefixed variants)
  //   prefix - copied onto the token as `token.prefix`
  //   state  - lexer state to enter; entries without one enter "at-group"
  //
  // The `@` handler in the main loop uses the first entry whose name matches
  // the text following the cursor.
  const atRules: any = [
    "media",
    "keyframes",
    { name: "-webkit-keyframes", type: "keyframes", prefix: "-webkit-" },
    { name: "-moz-keyframes", type: "keyframes", prefix: "-moz-" },
    { name: "-ms-keyframes", type: "keyframes", prefix: "-ms-" },
    { name: "-o-keyframes", type: "keyframes", prefix: "-o-" },
    "font-face",
    { name: "import", state: "before-at-value" },
    { name: "charset", state: "before-at-value" },
    "supports",
    "viewport",
    { name: "namespace", state: "before-at-value" },
    "document",
    { name: "-moz-document", type: "document", prefix: "-moz-" },
    "page",
  ];

  // -- Functions ------------------------------------------------------------

  /**
   * Step the cursor forward by one character and read what is there.
   *
   * @returns The character now under the cursor. At runtime this is
   *   `undefined` once the cursor has moved past the end of the input, which
   *   is what ends the main loop.
   */
  function getCh(): string {
    skip();
    const current = css[cursor];
    return current;
  }

  /**
   * Look up a state on the stack, counting down from the top.
   *
   * With no index (or index 0) the tracked current state is returned
   * directly; index 1 is the state beneath it, and so on.
   *
   * @param index How far below the top of the stack to look. Defaults to 0.
   * @returns The state found at that depth.
   */
  function getState(index?: number): string {
    if (!index) {
      return state;
    }

    const top = stack.length - 1;
    return stack[top - index];
  }

  /**
   * Check whether the input continues with the given string immediately
   * after the cursor. The match must begin at the very next position; the
   * cursor itself is not moved.
   *
   * @param str The text expected right after the cursor.
   * @returns `true` when the input at the next position starts with `str`.
   */
  function isNextString(str: string): boolean {
    const nextIndex = cursor + 1;
    return css.startsWith(str, nextIndex);
  }

  /**
   * Search forward from the cursor for a substring that may begin anywhere
   * after the current character.
   *
   * The result is an offset relative to the cursor, not an absolute index
   * into the input, so it can be handed to `skip()` directly. A match that
   * starts exactly at the cursor (offset 0) is reported as not found.
   *
   * The cursor is expected to be on a character of the input (>= 0), which
   * is always the case inside the main loop.
   *
   * @param str The substring to look for.
   * @returns The distance from the cursor to the start of the first match,
   *   or `false` when there is no match ahead of the cursor.
   */
  function find(str: string): number | false {
    // Search in place instead of slicing: this avoids copying the rest of
    // the input on every call.
    const index = css.indexOf(str, cursor);
    if (index === -1) {
      return false;
    }

    const offset = index - cursor;
    return offset > 0 ? offset : false;
  }

  /**
   * Check whether the character right after the cursor is the given one.
   *
   * @param ch The character to compare against.
   * @returns `true` when the next character equals `ch`.
   */
  function isNextChar(ch: string): boolean {
    const upcoming = peek(1);
    return upcoming === ch;
  }

  /**
   * Read a character relative to the cursor without moving it. Positive
   * offsets look ahead, negative offsets look behind.
   *
   * An offset of 0 (or any other falsy value) is treated as 1, so this
   * cannot be used to read the character under the cursor.
   *
   * @param offset Distance from the cursor; falls back to 1.
   * @returns The character at that position, or `undefined` at runtime when
   *   the position lies outside the input.
   */
  function peek(offset: number): string {
    const distance = offset || 1;
    const target = cursor + distance;
    return css[target];
  }

  /**
   * Drop the top state from the stack and make the state beneath it the
   * current one.
   *
   * @returns The state that was removed, or `undefined` if the stack was
   *   already empty.
   */
  function popState(): string | undefined {
    const removed = stack.pop();
    const [newTop] = stack.slice(-1);
    state = newTop;

    return removed;
  }

  /**
   * Enter a new state: it becomes the current state and is placed on top of
   * the stack, leaving the previous state beneath it.
   *
   * @param newState The state to enter.
   * @returns The stack length after the push.
   */
  function pushState(newState: string): number {
    state = newState;

    // Array.prototype.push returns the new length.
    return stack.push(newState);
  }

  /**
   * Swap the state on top of the stack for another one without changing the
   * stack depth.
   *
   * @param newState The state to switch to.
   * @returns The state that was current before the swap.
   */
  function replaceState(newState: string): string {
    const outgoing = state;
    const topIndex = stack.length - 1;

    state = newState;
    stack[topIndex] = newState;

    return outgoing;
  }

  /**
   * Advance the cursor and keep the line/column counters in step with it.
   *
   * The counters are updated from the characters being left behind (the one
   * under the cursor and those before the new position): each newline among
   * them bumps the line number and restarts the column at 1.
   *
   * Only forward movement is supported; negative values are not.
   *
   * @param n How many characters to move. Omitted or 0 means 1.
   */
  function skip(n?: number) {
    const count = n || 1;

    if (count !== 1) {
      // Multi-character jump: inspect everything we are stepping over.
      const segments = css.slice(cursor, cursor + count).split("\n");
      const newlines = segments.length - 1;

      if (newlines > 0) {
        line += newlines;
        column = 1;
      }
      // Only the text after the last newline contributes to the column.
      column += segments[newlines].length;
      cursor += count;
      return;
    }

    // Single step: only the character under the cursor matters.
    if (css[cursor] === "\n") {
      line += 1;
      column = 1;
    } else {
      column += 1;
    }
    cursor += 1;
  }

  /**
   * Finish the current token: stamp it with the current line/column as its
   * end position, log it, append it to the output list, then start over with
   * an empty buffer and an empty token.
   */
  function addToken() {
    token.end = { line, col: column };

    const serialized = JSON.stringify(token, null, 2);
    debug("addToken:", serialized);

    tokens.push(token);

    token = {};
    buffer = "";
  }

  /**
   * Begin a fresh token of the given type, recording the current
   * line/column as its start position. Whatever token was in progress is
   * discarded.
   *
   * @param type Token type.
   */
  function initializeToken(type: string) {
    const origin = { line, col: column };
    token = { type, start: origin };
  }

  // -- Main Loop ------------------------------------------------------------

  /*
  The main loop is a state machine that reads in one character at a time,
  and determines what to do based on the current state and character.
  This is implemented as a series of nested `switch` statements: the outer
  one dispatches on the character, the inner ones on the current state.
  The case orders have been mildly optimized based on rough probabilities
  calculated by processing a small sample of real-world CSS.
  Further optimization (such as a dispatch table) shouldn't be necessary
  since the total number of cases is very low.
  */

  start = Date.now();

  // getCh() yields `undefined` once the cursor moves past the end of `css`;
  // that falsy value ends the loop.
  while ((ch = getCh())) {
    debug(ch, getState());

    // Line and column tracking is done inside skip(), not here.

    switch (ch) {
      // Space: kept only in states where whitespace is part of the text being
      // accumulated; in every other state it is dropped.
      case " ":
        switch (getState()) {
          case "selector":
          case "value":
          case "value-paren":
          case "at-group":
          case "at-value":
          case "comment":
          case "double-string":
          case "single-string":
            buffer += ch;
            break;
        }
        break;

      // Newline, tab, carriage return or form feed

      case "\n":
      case "\t":
      case "\r":
      case "\f":
        switch (getState()) {
          case "value":
          case "value-paren":
          case "at-group":
          case "comment":
          case "single-string":
          case "double-string":
          case "selector":
            buffer += ch;
            break;

          case "at-value":
            // Tokenize an @-rule if a semi-colon was omitted.
            // Only a newline ends the rule; the other whitespace characters
            // handled by this case are dropped in this state.
            if ("\n" === ch) {
              token.value = buffer.trim();
              addToken();
              popState();
            }
            break;
        }

        // Line/column bookkeeping for newlines lives in skip().
        break;

      case ":":
        switch (getState()) {
          case "name":
            // End of a property name: store it and wait for the value.
            token.name = buffer.trim();
            buffer = "";

            replaceState("before-value");
            break;

          case "before-selector":
            // A selector that begins with a colon.
            buffer += ch;

            initializeToken("selector");
            pushState("selector");
            break;

          case "before-value":
            // A colon as the first character of a value.
            replaceState("value");
            buffer += ch;
            break;

          default:
            buffer += ch;
            break;
        }
        break;

      case ";":
        switch (getState()) {
          case "name":
          case "before-value":
          case "value":
            // Tokenize a declaration
            // if value is empty skip the declaration
            if (buffer.trim().length > 0) {
              (token.value = buffer.trim()), addToken();
            }
            replaceState("before-name");
            break;

          case "value-paren":
            // Insignificant semi-colon
            buffer += ch;
            break;

          case "at-value":
            // Tokenize an @-rule
            token.value = buffer.trim();
            addToken();
            popState();
            break;

          case "before-name":
            // Extraneous semi-colon
            break;

          default:
            buffer += ch;
            break;
        }
        break;

      case "{":
        switch (getState()) {
          case "selector":
            // If the sequence is `\{` then assume that the brace should be escaped.
            if (peek(-1) === "\\") {
              buffer += ch;
              break;
            }

            // Tokenize a selector
            token.text = buffer.trim();
            addToken();
            replaceState("before-name");
            depth = depth + 1;
            break;

          case "at-group":
            // Tokenize an @-group
            token.name = buffer.trim();
            // These three token types go straight to declarations; every
            // other @-group is followed by nested rules (selectors).
            switch (token.type) {
              case "font-face":
              case "viewport":
              case "page":
                pushState("before-name");
                break;

              default:
                pushState("before-selector");
            }

            addToken();
            depth = depth + 1;
            break;

          case "name":
          case "at-rule":
            // Tokenize a declaration or an @-rule
            token.name = buffer.trim();
            addToken();
            pushState("before-name");
            depth = depth + 1;
            break;

          case "comment":
          case "double-string":
          case "single-string":
            // Ignore braces in comments and strings
            buffer += ch;
            break;
          case "before-value":
            // A brace as the first character of a value is kept as value text.
            replaceState("value");
            buffer += ch;
            break;
        }

        break;

      case "}":
        switch (getState()) {
          case "before-name":
          case "name":
          case "before-value":
          case "value":
            // If the buffer contains anything, it is a value
            if (buffer) {
              token.value = buffer.trim();
            }

            // If the current token has a name and a value it should be tokenized.
            if (token.name && token.value) {
              addToken();
            }

            // Leave the block
            initializeToken("end");
            addToken();
            popState();

            // We might need to leave again.
            // XXX: What about 3 levels deep?
            if ("at-group" === getState()) {
              initializeToken("at-group-end");
              addToken();
              popState();
            }

            if (depth > 0) {
              depth = depth - 1;
            }

            break;

          case "at-group":
          case "before-selector":
          case "selector":
            // If the sequence is `\}` then assume that the brace should be escaped.
            if (peek(-1) === "\\") {
              buffer += ch;
              break;
            }

            if (depth > 0) {
              // Leave block if in an at-group
              // (getState(1) is the state directly beneath the current one.)
              if ("at-group" === getState(1)) {
                initializeToken("at-group-end");
                addToken();
              }
            }

            if (depth > 1) {
              popState();
            }

            if (depth > 0) {
              depth = depth - 1;
            }
            break;

          case "double-string":
          case "single-string":
          case "comment":
            // Ignore braces in comments and strings.
            buffer += ch;
            break;
        }

        break;

      // Strings

      case '"':
      case "'":
        switch (getState()) {
          case "double-string":
            // Only an unescaped matching quote closes the string.
            if ('"' === ch && "\\" !== peek(-1)) {
              popState();
            }
            break;

          case "single-string":
            // Only an unescaped matching quote closes the string.
            if ("'" === ch && "\\" !== peek(-1)) {
              popState();
            }
            break;

          case "before-at-value":
            replaceState("at-value");
            pushState('"' === ch ? "double-string" : "single-string");
            break;

          case "before-value":
            replaceState("value");
            pushState('"' === ch ? "double-string" : "single-string");
            break;

          case "comment":
            // Ignore strings within comments.
            break;

          default:
            // An unescaped quote opens a string in any other state.
            if ("\\" !== peek(-1)) {
              pushState('"' === ch ? "double-string" : "single-string");
            }
        }

        // The quote character itself is always kept in the buffer.
        buffer += ch;
        break;

      // Comments

      case "/":
        switch (getState()) {
          case "comment":
          case "double-string":
          case "single-string":
            // Ignore
            buffer += ch;
            break;

          case "before-value":
          case "selector":
          case "name":
          case "value":
            if (isNextChar("*")) {
              // Ignore comments in selectors, properties and values. They are
              // difficult to represent in the AST.
              let pos = find("*/");

              // `pos` is the offset from the cursor to the `*` of the closing
              // `*/`, so skipping pos + 1 parks the cursor on the closing `/`
              // and the next getCh() moves past the comment. When no `*/` is
              // found nothing is skipped and this `/` is not buffered.
              if (pos && typeof pos !== "boolean") {
                skip(pos + 1);
              }
            } else {
              if (getState() == "before-value") replaceState("value");
              buffer += ch;
            }
            break;

          default:
            if (isNextChar("*")) {
              // Create a comment token
              initializeToken("comment");
              pushState("comment");
              // Consume the `*` of the opening `/*`.
              skip();
            } else {
              buffer += ch;
            }
            break;
        }
        break;

      // Comment end or universal selector

      case "*":
        switch (getState()) {
          case "comment":
            if (isNextChar("/")) {
              // Tokenize a comment
              token.text = buffer; // Don't trim()!
              // Consume the `/` of the closing `*/`.
              skip();
              addToken();
              popState();
            } else {
              buffer += ch;
            }
            break;

          case "before-selector":
            buffer += ch;
            initializeToken("selector");
            pushState("selector");
            break;

          case "before-value":
            replaceState("value");
            buffer += ch;
            break;

          default:
            buffer += ch;
        }
        break;

      // @-rules

      case "@":
        switch (getState()) {
          case "comment":
          case "double-string":
          case "single-string":
            buffer += ch;
            break;
          case "before-value":
            replaceState("value");
            buffer += ch;
            break;

          default:
            // Iterate over the supported @-rules and attempt to tokenize one.
            let tokenized = false;
            let name;
            let rule;

            // The loop stops at the first entry whose name follows the `@`.
            for (let j = 0, len = atRules.length; !tokenized && j < len; ++j) {
              rule = atRules[j];
              // Entries are either a bare name or an object with a `name`.
              name = rule.name || rule;

              if (!isNextString(name)) continue;

              tokenized = true;

              // The token type starts out as the rule name and is overridden
              // below when the entry supplies an explicit `type`.
              initializeToken(name);
              pushState(rule.state || "at-group");
              // Move the cursor onto the last character of the rule name.
              skip(name.length);

              if (rule.prefix) {
                token.prefix = rule.prefix;
              }

              if (rule.type) {
                token.type = rule.type;
              }
            }

            // Unrecognized @-rule: keep the `@` as ordinary text.
            if (!tokenized) {
              buffer += ch;
            }
            break;
        }
        break;

      // Parentheses are tracked to disambiguate semi-colons, such as within a
      // data URI.

      case "(":
        switch (getState()) {
          case "value":
            pushState("value-paren");
            break;
          case "before-value":
            replaceState("value");
            break;
        }

        buffer += ch;
        break;

      case ")":
        switch (getState()) {
          case "value-paren":
            popState();
            break;
          case "before-value":
            replaceState("value");
            break;
        }

        buffer += ch;
        break;

      // Any other character: leave a "before-*" state for the state it
      // introduces (starting a token where needed), then buffer the character.
      default:
        switch (getState()) {
          case "before-selector":
            initializeToken("selector");
            pushState("selector");
            break;

          case "before-name":
            initializeToken("property");
            replaceState("name");
            break;

          case "before-value":
            replaceState("value");
            break;

          case "before-at-value":
            replaceState("at-value");
            break;
        }

        buffer += ch;
        break;
    }
  }

  // Report the elapsed lexing time in milliseconds.
  debug("ran in", Date.now() - start + "ms");

  return tokens;
}