diff --git a/examples/cli.php b/examples/cli.php index d20bbdf..0d6f433 100644 --- a/examples/cli.php +++ b/examples/cli.php @@ -26,5 +26,6 @@ require_once __DIR__ . '/../vendor/autoload.php'; use Doctrine\SqlFormatter\SqlFormatter; +use Doctrine\SqlFormatter\Tokenizer; echo (new SqlFormatter())->format($sql); diff --git a/src/CliHighlighter.php b/src/CliHighlighter.php index 5cc0242..6c8e515 100644 --- a/src/CliHighlighter.php +++ b/src/CliHighlighter.php @@ -38,7 +38,7 @@ final class CliHighlighter implements Highlighter public function highlightToken(int $type, string $value) : string { - if ($type === SqlFormatter::TOKEN_TYPE_BOUNDARY && ($value==='(' || $value===')')) { + if ($type === Token::TOKEN_TYPE_BOUNDARY && ($value==='(' || $value===')')) { return $value; } @@ -53,24 +53,24 @@ public function highlightToken(int $type, string $value) : string private function prefix(int $type) : ?string { switch ($type) { - case SqlFormatter::TOKEN_TYPE_BOUNDARY: + case Token::TOKEN_TYPE_BOUNDARY: return $this->cliBoundary; - case SqlFormatter::TOKEN_TYPE_WORD: + case Token::TOKEN_TYPE_WORD: return $this->cliWord; - case SqlFormatter::TOKEN_TYPE_BACKTICK_QUOTE: + case Token::TOKEN_TYPE_BACKTICK_QUOTE: return $this->cliBacktickQuote; - case SqlFormatter::TOKEN_TYPE_QUOTE: + case Token::TOKEN_TYPE_QUOTE: return $this->cliQuote; - case SqlFormatter::TOKEN_TYPE_RESERVED: - case SqlFormatter::TOKEN_TYPE_RESERVED_TOPLEVEL: - case SqlFormatter::TOKEN_TYPE_RESERVED_NEWLINE: + case Token::TOKEN_TYPE_RESERVED: + case Token::TOKEN_TYPE_RESERVED_TOPLEVEL: + case Token::TOKEN_TYPE_RESERVED_NEWLINE: return $this->cliReserved; - case SqlFormatter::TOKEN_TYPE_NUMBER: + case Token::TOKEN_TYPE_NUMBER: return $this->cliNumber; - case SqlFormatter::TOKEN_TYPE_VARIABLE: + case Token::TOKEN_TYPE_VARIABLE: return $this->cliVariable; - case SqlFormatter::TOKEN_TYPE_COMMENT: - case SqlFormatter::TOKEN_TYPE_BLOCK_COMMENT: + case Token::TOKEN_TYPE_COMMENT: + case Token::TOKEN_TYPE_BLOCK_COMMENT: return $this->cliComment; default: return null; diff --git a/src/HtmlHighlighter.php b/src/HtmlHighlighter.php index dd5ed01..55d56d3 100644 --- a/src/HtmlHighlighter.php +++ b/src/HtmlHighlighter.php @@ -53,7 +53,7 @@ public function highlightToken(int $type, string $value) : string { $value = htmlentities($value, ENT_COMPAT | ENT_IGNORE, 'UTF-8'); - if ($type === SqlFormatter::TOKEN_TYPE_BOUNDARY && ($value==='(' || $value===')')) { + if ($type === Token::TOKEN_TYPE_BOUNDARY && ($value==='(' || $value===')')) { return $value; } @@ -68,24 +68,24 @@ public function highlightToken(int $type, string $value) : string public function attributes(int $type) : ?string { switch ($type) { - case SqlFormatter::TOKEN_TYPE_BOUNDARY: + case Token::TOKEN_TYPE_BOUNDARY: return $this->boundaryAttributes; - case SqlFormatter::TOKEN_TYPE_WORD: + case Token::TOKEN_TYPE_WORD: return $this->wordAttributes; - case SqlFormatter::TOKEN_TYPE_BACKTICK_QUOTE: + case Token::TOKEN_TYPE_BACKTICK_QUOTE: return $this->backtickQuoteAttributes; - case SqlFormatter::TOKEN_TYPE_QUOTE: + case Token::TOKEN_TYPE_QUOTE: return $this->quoteAttributes; - case SqlFormatter::TOKEN_TYPE_RESERVED: - case SqlFormatter::TOKEN_TYPE_RESERVED_TOPLEVEL: - case SqlFormatter::TOKEN_TYPE_RESERVED_NEWLINE: + case Token::TOKEN_TYPE_RESERVED: + case Token::TOKEN_TYPE_RESERVED_TOPLEVEL: + case Token::TOKEN_TYPE_RESERVED_NEWLINE: return $this->reservedAttributes; - case SqlFormatter::TOKEN_TYPE_NUMBER: + case Token::TOKEN_TYPE_NUMBER: return $this->numberAttributes; - case SqlFormatter::TOKEN_TYPE_VARIABLE: + case Token::TOKEN_TYPE_VARIABLE: return $this->variableAttributes; - case SqlFormatter::TOKEN_TYPE_COMMENT: - case SqlFormatter::TOKEN_TYPE_BLOCK_COMMENT: + case Token::TOKEN_TYPE_COMMENT: + case Token::TOKEN_TYPE_BLOCK_COMMENT: return $this->commentAttributes; default: return null; diff --git a/src/SqlFormatter.php b/src/SqlFormatter.php index 209c38a..d083f1e 100644 --- a/src/SqlFormatter.php +++ b/src/SqlFormatter.php @@ -11,701 +11,22 @@ namespace Doctrine\SqlFormatter; -use function array_combine; -use function array_keys; -use function array_map; use function array_search; use function array_shift; use function array_unshift; -use function arsort; use function assert; -use function count; use function current; -use function implode; -use function preg_match; -use function preg_quote; use function preg_replace; use function reset; use function rtrim; -use function serialize; use function str_repeat; use function str_replace; use function strlen; -use function strpos; -use function strtoupper; -use function substr; use function trim; use const PHP_SAPI; final class SqlFormatter { - // Constants for token types - public const TOKEN_TYPE_WHITESPACE = 0; - public const TOKEN_TYPE_WORD = 1; - public const TOKEN_TYPE_QUOTE = 2; - public const TOKEN_TYPE_BACKTICK_QUOTE = 3; - public const TOKEN_TYPE_RESERVED = 4; - public const TOKEN_TYPE_RESERVED_TOPLEVEL = 5; - public const TOKEN_TYPE_RESERVED_NEWLINE = 6; - public const TOKEN_TYPE_BOUNDARY = 7; - public const TOKEN_TYPE_COMMENT = 8; - public const TOKEN_TYPE_BLOCK_COMMENT = 9; - public const TOKEN_TYPE_NUMBER = 10; - public const TOKEN_TYPE_ERROR = 11; - public const TOKEN_TYPE_VARIABLE = 12; - - // Constants for different components of a token - public const TOKEN_TYPE = 0; - public const TOKEN_VALUE = 1; - - /** - * Reserved words (for syntax highlighting) - * - * @var string[] - */ - private $reserved = [ - 'ACCESSIBLE', - 'ACTION', - 'AGAINST', - 'AGGREGATE', - 'ALGORITHM', - 'ALL', - 'ALTER', - 'ANALYSE', - 'ANALYZE', - 'AS', - 'ASC', - 'AUTOCOMMIT', - 'AUTO_INCREMENT', - 'BACKUP', - 'BEGIN', - 'BETWEEN', - 'BINLOG', - 'BOTH', - 'CASCADE', - 'CASE', - 'CHANGE', - 'CHANGED', - 'CHARACTER SET', - 'CHARSET', - 'CHECK', - 'CHECKSUM', - 'COLLATE', - 'COLLATION', - 'COLUMN', - 'COLUMNS', - 'COMMENT', - 'COMMIT', - 'COMMITTED', - 'COMPRESSED', - 'CONCURRENT', - 'CONSTRAINT', - 'CONTAINS', - 'CONVERT', - 'CREATE', - 'CROSS', - 'CURRENT_TIMESTAMP', - 'DATABASE', - 'DATABASES', - 'DAY', - 'DAY_HOUR', - 'DAY_MINUTE', - 'DAY_SECOND', - 'DEFAULT', - 'DEFINER', - 'DELAYED', - 'DELETE', - 'DESC', - 'DESCRIBE', - 'DETERMINISTIC', - 'DISTINCT', - 'DISTINCTROW', - 'DIV', - 'DO', - 'DUMPFILE', - 'DUPLICATE', - 'DYNAMIC', - 'ELSE', - 'ENCLOSED', - 'END', - 'ENGINE', - 'ENGINE_TYPE', - 'ENGINES', - 'ESCAPE', - 'ESCAPED', - 'EVENTS', - 'EXEC', - 'EXECUTE', - 'EXISTS', - 'EXPLAIN', - 'EXTENDED', - 'FAST', - 'FIELDS', - 'FILE', - 'FIRST', - 'FIXED', - 'FLUSH', - 'FOR', - 'FORCE', - 'FOREIGN', - 'FULL', - 'FULLTEXT', - 'FUNCTION', - 'GLOBAL', - 'GRANT', - 'GRANTS', - 'GROUP_CONCAT', - 'HEAP', - 'HIGH_PRIORITY', - 'HOSTS', - 'HOUR', - 'HOUR_MINUTE', - 'HOUR_SECOND', - 'IDENTIFIED', - 'IF', - 'IFNULL', - 'IGNORE', - 'IN', - 'INDEX', - 'INDEXES', - 'INFILE', - 'INSERT', - 'INSERT_ID', - 'INSERT_METHOD', - 'INTERVAL', - 'INTO', - 'INVOKER', - 'IS', - 'ISOLATION', - 'KEY', - 'KEYS', - 'KILL', - 'LAST_INSERT_ID', - 'LEADING', - 'LEVEL', - 'LIKE', - 'LINEAR', - 'LINES', - 'LOAD', - 'LOCAL', - 'LOCK', - 'LOCKS', - 'LOGS', - 'LOW_PRIORITY', - 'MARIA', - 'MASTER', - 'MASTER_CONNECT_RETRY', - 'MASTER_HOST', - 'MASTER_LOG_FILE', - 'MATCH', - 'MAX_CONNECTIONS_PER_HOUR', - 'MAX_QUERIES_PER_HOUR', - 'MAX_ROWS', - 'MAX_UPDATES_PER_HOUR', - 'MAX_USER_CONNECTIONS', - 'MEDIUM', - 'MERGE', - 'MINUTE', - 'MINUTE_SECOND', - 'MIN_ROWS', - 'MODE', - 'MODIFY', - 'MONTH', - 'MRG_MYISAM', - 'MYISAM', - 'NAMES', - 'NATURAL', - 'NOT', - 'NOW()', - 'NULL', - 'OFFSET', - 'ON', - 'OPEN', - 'OPTIMIZE', - 'OPTION', - 'OPTIONALLY', - 'ON UPDATE', - 'ON DELETE', - 'OUTFILE', - 'PACK_KEYS', - 'PAGE', - 'PARTIAL', - 'PARTITION', - 'PARTITIONS', - 'PASSWORD', - 'PRIMARY', - 'PRIVILEGES', - 'PROCEDURE', - 'PROCESS', - 'PROCESSLIST', - 'PURGE', - 'QUICK', - 'RANGE', - 'RAID0', - 'RAID_CHUNKS', - 'RAID_CHUNKSIZE', - 'RAID_TYPE', - 'READ', - 'READ_ONLY', - 'READ_WRITE', - 'REFERENCES', - 'REGEXP', - 'RELOAD', - 'RENAME', - 'REPAIR', - 'REPEATABLE', - 'REPLACE', - 'REPLICATION', - 'RESET', - 'RESTORE', - 'RESTRICT', - 'RETURN', - 'RETURNS', - 'REVOKE', - 'RLIKE', - 'ROLLBACK', - 'ROW', - 'ROWS', - 'ROW_FORMAT', - 'SECOND', - 'SECURITY', - 'SEPARATOR', - 'SERIALIZABLE', - 'SESSION', - 'SHARE', - 'SHOW', - 'SHUTDOWN', - 'SLAVE', - 'SONAME', - 'SOUNDS', - 'SQL', - 'SQL_AUTO_IS_NULL', - 'SQL_BIG_RESULT', - 'SQL_BIG_SELECTS', - 'SQL_BIG_TABLES', - 'SQL_BUFFER_RESULT', - 'SQL_CALC_FOUND_ROWS', - 'SQL_LOG_BIN', - 'SQL_LOG_OFF', - 'SQL_LOG_UPDATE', - 'SQL_LOW_PRIORITY_UPDATES', - 'SQL_MAX_JOIN_SIZE', - 'SQL_QUOTE_SHOW_CREATE', - 'SQL_SAFE_UPDATES', - 'SQL_SELECT_LIMIT', - 'SQL_SLAVE_SKIP_COUNTER', - 'SQL_SMALL_RESULT', - 'SQL_WARNINGS', - 'SQL_CACHE', - 'SQL_NO_CACHE', - 'START', - 'STARTING', - 'STATUS', - 'STOP', - 'STORAGE', - 'STRAIGHT_JOIN', - 'STRING', - 'STRIPED', - 'SUPER', - 'TABLE', - 'TABLES', - 'TEMPORARY', - 'TERMINATED', - 'THEN', - 'TO', - 'TRAILING', - 'TRANSACTIONAL', - 'TRUE', - 'TRUNCATE', - 'TYPE', - 'TYPES', - 'UNCOMMITTED', - 'UNIQUE', - 'UNLOCK', - 'UNSIGNED', - 'USAGE', - 'USE', - 'USING', - 'VARIABLES', - 'VIEW', - 'WHEN', - 'WITH', - 'WORK', - 'WRITE', - 'YEAR_MONTH', - ]; - - /** - * For SQL formatting - * These keywords will all be on their own line - * - * @var string[] - */ - private $reservedToplevel = [ - 'SELECT', - 'FROM', - 'WHERE', - 'SET', - 'ORDER BY', - 'GROUP BY', - 'LIMIT', - 'DROP', - 'VALUES', - 'UPDATE', - 'HAVING', - 'ADD', - 'AFTER', - 'ALTER TABLE', - 'DELETE FROM', - 'UNION ALL', - 'UNION', - 'EXCEPT', - 'INTERSECT', - ]; - - /** @var string[] */ - private $reservedNewline = [ - 'LEFT OUTER JOIN', - 'RIGHT OUTER JOIN', - 'LEFT JOIN', - 'RIGHT JOIN', - 'OUTER JOIN', - 'INNER JOIN', - 'JOIN', - 'XOR', - 'OR', - 'AND', - ]; - - /** @var string[] */ - private $functions = [ - 'ABS', - 'ACOS', - 'ADDDATE', - 'ADDTIME', - 'AES_DECRYPT', - 'AES_ENCRYPT', - 'AREA', - 'ASBINARY', - 'ASCII', - 'ASIN', - 'ASTEXT', - 'ATAN', - 'ATAN2', - 'AVG', - 'BDMPOLYFROMTEXT', - 'BDMPOLYFROMWKB', - 'BDPOLYFROMTEXT', - 'BDPOLYFROMWKB', - 'BENCHMARK', - 'BIN', - 'BIT_AND', - 'BIT_COUNT', - 'BIT_LENGTH', - 'BIT_OR', - 'BIT_XOR', - 'BOUNDARY', - 'BUFFER', - 'CAST', - 'CEIL', - 'CEILING', - 'CENTROID', - 'CHAR', - 'CHARACTER_LENGTH', - 'CHARSET', - 'CHAR_LENGTH', - 'COALESCE', - 'COERCIBILITY', - 'COLLATION', - 'COMPRESS', - 'CONCAT', - 'CONCAT_WS', - 'CONNECTION_ID', - 'CONTAINS', - 'CONV', - 'CONVERT', - 'CONVERT_TZ', - 'CONVEXHULL', - 'COS', - 'COT', - 'COUNT', - 'CRC32', - 'CROSSES', - 'CURDATE', - 'CURRENT_DATE', - 'CURRENT_TIME', - 'CURRENT_TIMESTAMP', - 'CURRENT_USER', - 'CURTIME', - 'DATABASE', - 'DATE', - 'DATEDIFF', - 'DATE_ADD', - 'DATE_DIFF', - 'DATE_FORMAT', - 'DATE_SUB', - 'DAY', - 'DAYNAME', - 'DAYOFMONTH', - 'DAYOFWEEK', - 'DAYOFYEAR', - 'DECODE', - 'DEFAULT', - 'DEGREES', - 'DES_DECRYPT', - 'DES_ENCRYPT', - 'DIFFERENCE', - 'DIMENSION', - 'DISJOINT', - 'DISTANCE', - 'ELT', - 'ENCODE', - 'ENCRYPT', - 'ENDPOINT', - 'ENVELOPE', - 'EQUALS', - 'EXP', - 'EXPORT_SET', - 'EXTERIORRING', - 'EXTRACT', - 'EXTRACTVALUE', - 'FIELD', - 'FIND_IN_SET', - 'FLOOR', - 'FORMAT', - 'FOUND_ROWS', - 'FROM_DAYS', - 'FROM_UNIXTIME', - 'GEOMCOLLFROMTEXT', - 'GEOMCOLLFROMWKB', - 'GEOMETRYCOLLECTION', - 'GEOMETRYCOLLECTIONFROMTEXT', - 'GEOMETRYCOLLECTIONFROMWKB', - 'GEOMETRYFROMTEXT', - 'GEOMETRYFROMWKB', - 'GEOMETRYN', - 'GEOMETRYTYPE', - 'GEOMFROMTEXT', - 'GEOMFROMWKB', - 'GET_FORMAT', - 'GET_LOCK', - 'GLENGTH', - 'GREATEST', - 'GROUP_CONCAT', - 'GROUP_UNIQUE_USERS', - 'HEX', - 'HOUR', - 'IF', - 'IFNULL', - 'INET_ATON', - 'INET_NTOA', - 'INSERT', - 'INSTR', - 'INTERIORRINGN', - 'INTERSECTION', - 'INTERSECTS', - 'INTERVAL', - 'ISCLOSED', - 'ISEMPTY', - 'ISNULL', - 'ISRING', - 'ISSIMPLE', - 'IS_FREE_LOCK', - 'IS_USED_LOCK', - 'LAST_DAY', - 'LAST_INSERT_ID', - 'LCASE', - 'LEAST', - 'LEFT', - 'LENGTH', - 'LINEFROMTEXT', - 'LINEFROMWKB', - 'LINESTRING', - 'LINESTRINGFROMTEXT', - 'LINESTRINGFROMWKB', - 'LN', - 'LOAD_FILE', - 'LOCALTIME', - 'LOCALTIMESTAMP', - 'LOCATE', - 'LOG', - 'LOG10', - 'LOG2', - 'LOWER', - 'LPAD', - 'LTRIM', - 'MAKEDATE', - 'MAKETIME', - 'MAKE_SET', - 'MASTER_POS_WAIT', - 'MAX', - 'MBRCONTAINS', - 'MBRDISJOINT', - 'MBREQUAL', - 'MBRINTERSECTS', - 'MBROVERLAPS', - 'MBRTOUCHES', - 'MBRWITHIN', - 'MD5', - 'MICROSECOND', - 'MID', - 'MIN', - 'MINUTE', - 'MLINEFROMTEXT', - 'MLINEFROMWKB', - 'MOD', - 'MONTH', - 'MONTHNAME', - 'MPOINTFROMTEXT', - 'MPOINTFROMWKB', - 'MPOLYFROMTEXT', - 'MPOLYFROMWKB', - 'MULTILINESTRING', - 'MULTILINESTRINGFROMTEXT', - 'MULTILINESTRINGFROMWKB', - 'MULTIPOINT', - 'MULTIPOINTFROMTEXT', - 'MULTIPOINTFROMWKB', - 'MULTIPOLYGON', - 'MULTIPOLYGONFROMTEXT', - 'MULTIPOLYGONFROMWKB', - 'NAME_CONST', - 'NULLIF', - 'NUMGEOMETRIES', - 'NUMINTERIORRINGS', - 'NUMPOINTS', - 'OCT', - 'OCTET_LENGTH', - 'OLD_PASSWORD', - 'ORD', - 'OVERLAPS', - 'PASSWORD', - 'PERIOD_ADD', - 'PERIOD_DIFF', - 'PI', - 'POINT', - 'POINTFROMTEXT', - 'POINTFROMWKB', - 'POINTN', - 'POINTONSURFACE', - 'POLYFROMTEXT', - 'POLYFROMWKB', - 'POLYGON', - 'POLYGONFROMTEXT', - 'POLYGONFROMWKB', - 'POSITION', - 'POW', - 'POWER', - 'QUARTER', - 'QUOTE', - 'RADIANS', - 'RAND', - 'RELATED', - 'RELEASE_LOCK', - 'REPEAT', - 'REPLACE', - 'REVERSE', - 'RIGHT', - 'ROUND', - 'ROW_COUNT', - 'RPAD', - 'RTRIM', - 'SCHEMA', - 'SECOND', - 'SEC_TO_TIME', - 'SESSION_USER', - 'SHA', - 'SHA1', - 'SIGN', - 'SIN', - 'SLEEP', - 'SOUNDEX', - 'SPACE', - 'SQRT', - 'SRID', - 'STARTPOINT', - 'STD', - 'STDDEV', - 'STDDEV_POP', - 'STDDEV_SAMP', - 'STRCMP', - 'STR_TO_DATE', - 'SUBDATE', - 'SUBSTR', - 'SUBSTRING', - 'SUBSTRING_INDEX', - 'SUBTIME', - 'SUM', - 'SYMDIFFERENCE', - 'SYSDATE', - 'SYSTEM_USER', - 'TAN', - 'TIME', - 'TIMEDIFF', - 'TIMESTAMP', - 'TIMESTAMPADD', - 'TIMESTAMPDIFF', - 'TIME_FORMAT', - 'TIME_TO_SEC', - 'TOUCHES', - 'TO_DAYS', - 'TRIM', - 'TRUNCATE', - 'UCASE', - 'UNCOMPRESS', - 'UNCOMPRESSED_LENGTH', - 'UNHEX', - 'UNIQUE_USERS', - 'UNIX_TIMESTAMP', - 'UPDATEXML', - 'UPPER', - 'USER', - 'UTC_DATE', - 'UTC_TIME', - 'UTC_TIMESTAMP', - 'UUID', - 'VARIANCE', - 'VAR_POP', - 'VAR_SAMP', - 'VERSION', - 'WEEK', - 'WEEKDAY', - 'WEEKOFYEAR', - 'WITHIN', - 'X', - 'Y', - 'YEAR', - 'YEARWEEK', - ]; - - /** - * Punctuation that can be used as a boundary between other tokens - * - * @var string[] - */ - private $boundaries = [ - ',', - ';', - ':', - ')', - '(', - '.', - '=', - '<', - '>', - '+', - '-', - '*', - '/', - '!', - '^', - '%', - '|', - '&', - '#', - ]; - - /** * Whether or not the current environment is the CLI * This affects the type of syntax highlighting @@ -722,343 +43,18 @@ final class SqlFormatter */ public $tab = ' '; - // Regular expressions for tokenizing - - /** @var string */ - private $regexBoundaries; - - /** @var string */ - private $regexReserved; - - /** @var string */ - private $regexReservedNewline; - - /** @var string */ - private $regexReservedToplevel; - - /** @var string */ - private $regexFunction; - - // Cache variables - // Only tokens shorter than this size will be cached. Somewhere between 10 - // and 20 seems to work well for most cases. - /** @var int */ - public $maxCachekeySize = 15; - - /** @var mixed[][] */ - private $tokenCache = []; - - /** @var int */ - private $cacheHits = 0; - - /** @var int */ - private $cacheMisses = 0; + /** @var Tokenizer */ + private $tokenizer; /** @var Highlighter */ private $highlighter; - public function __construct(?Highlighter $highlighter = null) - { - // Sort reserved word list from longest word to shortest, 3x faster than usort - $reservedMap = array_combine($this->reserved, array_map('strlen', $this->reserved)); - assert($reservedMap !== false); - arsort($reservedMap); - $this->reserved = array_keys($reservedMap); - - // Set up regular expressions - $this->regexBoundaries = '(' . implode( - '|', - $this->quoteRegex($this->boundaries) - ) . ')'; - $this->regexReserved = '(' . implode( - '|', - $this->quoteRegex($this->reserved) - ) . ')'; - $this->regexReservedToplevel = str_replace(' ', '\\s+', '(' . implode( - '|', - $this->quoteRegex($this->reservedToplevel) - ) . ')'); - $this->regexReservedNewline = str_replace(' ', '\\s+', '(' . implode( - '|', - $this->quoteRegex($this->reservedNewline) - ) . ')'); - - $this->regexFunction = '(' . implode('|', $this->quoteRegex($this->functions)) . ')'; - - if ($highlighter === null) { - $this->highlighter = $this->isCli() ? new CliHighlighter() : new HtmlHighlighter(); - - return; - } - - $this->highlighter = $highlighter; - } - - /** - * Get stats about the token cache - * - * @return mixed[] An array containing the keys 'hits', 'misses', 'entries', and 'size' in bytes - */ - public function getCacheStats() : array - { - return [ - 'hits'=>$this->cacheHits, - 'misses'=>$this->cacheMisses, - 'entries'=>count($this->tokenCache), - 'size'=>strlen(serialize($this->tokenCache)), - ]; - } - - /** - * Return the next token and token type in a SQL string. - * Quoted strings, comments, reserved words, whitespace, and punctuation are all their own tokens. - * - * @param string $string The SQL string - * @param mixed[]|null $previous The result of the previous getNextToken() call - * - * @return mixed[] An associative array containing the type and value of the token. - */ - private function getNextToken(string $string, ?array $previous = null) : array - { - // Whitespace - if (preg_match('/^\s+/', $string, $matches)) { - return [ - self::TOKEN_VALUE => $matches[0], - self::TOKEN_TYPE=>self::TOKEN_TYPE_WHITESPACE, - ]; - } - - // Comment - if ($string[0] === '#' || - (isset($string[1]) && ($string[0]==='-' && $string[1]==='-') || - ($string[0]==='/' && $string[1]==='*'))) { - // Comment until end of line - if ($string[0] === '-' || $string[0] === '#') { - $last = strpos($string, "\n"); - $type = self::TOKEN_TYPE_COMMENT; - } else { // Comment until closing comment tag - $last = strpos($string, '*/', 2) + 2; - $type = self::TOKEN_TYPE_BLOCK_COMMENT; - } - - if ($last === false) { - $last = strlen($string); - } - - return [ - self::TOKEN_VALUE => substr($string, 0, $last), - self::TOKEN_TYPE => $type, - ]; - } - - // Quoted String - if ($string[0]==='"' || $string[0]==='\'' || $string[0]==='`' || $string[0]==='[') { - return [ - self::TOKEN_TYPE => ( - $string[0]==='`' || $string[0]==='[' - ? self::TOKEN_TYPE_BACKTICK_QUOTE - : self::TOKEN_TYPE_QUOTE), - self::TOKEN_VALUE => $this->getQuotedString($string), - ]; - } - - // User-defined Variable - if (($string[0] === '@' || $string[0] === ':') && isset($string[1])) { - $ret = [ - self::TOKEN_VALUE => null, - self::TOKEN_TYPE => self::TOKEN_TYPE_VARIABLE, - ]; - - // If the variable name is quoted - if ($string[1]==='"' || $string[1]==='\'' || $string[1]==='`') { - $ret[self::TOKEN_VALUE] = $string[0] . $this->getQuotedString(substr($string, 1)); - } else { - // Non-quoted variable name - preg_match('/^(' . $string[0] . '[a-zA-Z0-9\._\$]+)/', $string, $matches); - if ($matches) { - $ret[self::TOKEN_VALUE] = $matches[1]; - } - } - - if ($ret[self::TOKEN_VALUE] !== null) { - return $ret; - } - } - - // Number (decimal, binary, or hex) - if (preg_match( - '/^([0-9]+(\.[0-9]+)?|0x[0-9a-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', - $string, - $matches - )) { - return [ - self::TOKEN_VALUE => $matches[1], - self::TOKEN_TYPE=>self::TOKEN_TYPE_NUMBER, - ]; - } - - // Boundary Character (punctuation and symbols) - if (preg_match('/^(' . $this->regexBoundaries . ')/', $string, $matches)) { - return [ - self::TOKEN_VALUE => $matches[1], - self::TOKEN_TYPE => self::TOKEN_TYPE_BOUNDARY, - ]; - } - - // A reserved word cannot be preceded by a '.' - // this makes it so in "mytable.from", "from" is not considered a reserved word - if (! $previous || ! isset($previous[self::TOKEN_VALUE]) || $previous[self::TOKEN_VALUE] !== '.') { - $upper = strtoupper($string); - // Top Level Reserved Word - if (preg_match( - '/^(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', - $upper, - $matches - )) { - return [ - self::TOKEN_TYPE=>self::TOKEN_TYPE_RESERVED_TOPLEVEL, - self::TOKEN_VALUE=>substr($string, 0, strlen($matches[1])), - ]; - } - - // Newline Reserved Word - if (preg_match( - '/^(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', - $upper, - $matches - )) { - return [ - self::TOKEN_TYPE=>self::TOKEN_TYPE_RESERVED_NEWLINE, - self::TOKEN_VALUE=>substr($string, 0, strlen($matches[1])), - ]; - } - - // Other Reserved Word - if (preg_match( - '/^(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', - $upper, - $matches - )) { - return [ - self::TOKEN_TYPE=>self::TOKEN_TYPE_RESERVED, - self::TOKEN_VALUE=>substr($string, 0, strlen($matches[1])), - ]; - } - } - - // A function must be suceeded by '(' - // this makes it so "count(" is considered a function, but "count" alone is not - $upper = strtoupper($string); - // function - if (preg_match('/^(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { - return [ - self::TOKEN_TYPE=>self::TOKEN_TYPE_RESERVED, - self::TOKEN_VALUE=>substr($string, 0, strlen($matches[1])-1), - ]; - } - - // Non reserved word - preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches); - - return [ - self::TOKEN_VALUE => $matches[1], - self::TOKEN_TYPE => self::TOKEN_TYPE_WORD, - ]; - } - - private function getQuotedString(string $string) : string - { - $ret = ''; - - // This checks for the following patterns: - // 1. backtick quoted string using `` to escape - // 2. square bracket quoted string (SQL Server) using ]] to escape - // 3. double quoted string using "" or \" to escape - // 4. single quoted string using '' or \' to escape - if (preg_match( - '/^(((`[^`]*($|`))+)| - ((\[[^\]]*($|\]))(\][^\]]*($|\]))*)| - (("[^"\\\\]*(?:\\\\.[^"\\\\]*)*("|$))+)| - ((\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*(\'|$))+))/sx', - $string, - $matches - )) { - $ret = $matches[1]; - } - - return $ret; - } - - /** - * Takes a SQL string and breaks it into tokens. - * Each token is an associative array with type and value. - * - * @param string $string The SQL string - * - * @return mixed[][] An array of tokens. - */ - private function tokenize(string $string) : array - { - $tokens = []; - - // Used for debugging if there is an error while tokenizing the string - $originalLength = strlen($string); - - // Used to make sure the string keeps shrinking on each iteration - $oldStringLen = strlen($string) + 1; - - $token = null; - - $currentLength = strlen($string); - - // Keep processing the string until it is empty - while ($currentLength) { - // If the string stopped shrinking, there was a problem - if ($oldStringLen <= $currentLength) { - $tokens[] = [ - self::TOKEN_VALUE=>$string, - self::TOKEN_TYPE=>self::TOKEN_TYPE_ERROR, - ]; - - return $tokens; - } - - $oldStringLen = $currentLength; - - // Determine if we can use caching - if ($currentLength >= $this->maxCachekeySize) { - $cacheKey = substr($string, 0, $this->maxCachekeySize); - } else { - $cacheKey = false; - } - - // See if the token is already cached - if ($cacheKey && isset($this->tokenCache[$cacheKey])) { - // Retrieve from cache - $token = $this->tokenCache[$cacheKey]; - $tokenLength = strlen($token[self::TOKEN_VALUE]); - $this->cacheHits++; - } else { - // Get the next token and the token type - $token = $this->getNextToken($string, $token); - $tokenLength = strlen($token[self::TOKEN_VALUE]); - $this->cacheMisses++; - - // If the token is shorter than the max length, store it in cache - if ($cacheKey && $tokenLength < $this->maxCachekeySize) { - $this->tokenCache[$cacheKey] = $token; - } - } - - $tokens[] = $token; - - // Advance the string - $string = substr($string, $tokenLength); - - $currentLength -= $tokenLength; - } - - return $tokens; + public function __construct( + ?Tokenizer $tokenizer = null, + ?Highlighter $highlighter = null + ) { + $this->tokenizer = $tokenizer ?? new Tokenizer(); + $this->highlighter = $highlighter ?? ($this->isCli() ? new CliHighlighter() : new HtmlHighlighter()); } /** @@ -1089,29 +85,26 @@ public function format(string $string, bool $highlight = true) : string $clauseLimit = false; // Tokenize String - $originalTokens = $this->tokenize($string); + $originalTokens = $this->tokenizer->tokenize($string); // Remove existing whitespace - $tokens = []; + $indexedTokens = []; foreach ($originalTokens as $i => $token) { - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_WHITESPACE) { + if ($token->type() === Token::TOKEN_TYPE_WHITESPACE) { continue; } - $token['i'] = $i; - $tokens[] = $token; + $indexedTokens[] = ['originalIndex' => $i, 'token' => $token]; } // Format token by token - foreach ($tokens as $i => $token) { + foreach ($indexedTokens as $i => $indexedToken) { + $token = $indexedToken['token']; // Get highlighted token if doing syntax highlighting if ($highlight) { - $highlighted = $this->highlighter->highlightToken( - $token[self::TOKEN_TYPE], - $token[self::TOKEN_VALUE] - ); + $highlighted = $this->highlighter->highlightToken($token->type(), $token->value()); } else { // If returning raw text - $highlighted = $token[self::TOKEN_VALUE]; + $highlighted = $token->value(); } // If we are increasing the special indent level now @@ -1138,9 +131,8 @@ public function format(string $string, bool $highlight = true) : string } // Display comments directly where they appear in the source - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_COMMENT || - $token[self::TOKEN_TYPE] === self::TOKEN_TYPE_BLOCK_COMMENT) { - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_BLOCK_COMMENT) { + if ($token->isOfType(Token::TOKEN_TYPE_COMMENT, Token::TOKEN_TYPE_BLOCK_COMMENT)) { + if ($token->isOfType(Token::TOKEN_TYPE_BLOCK_COMMENT)) { $indent = str_repeat($tab, $indentLevel); $return .= "\n" . $indent; $highlighted = str_replace("\n", "\n" . $indent, $highlighted); @@ -1153,7 +145,7 @@ public function format(string $string, bool $highlight = true) : string if ($inlineParentheses) { // End of inline parentheses - if ($token[self::TOKEN_VALUE] === ')') { + if ($token->value() === ')') { $return = rtrim($return, ' '); if ($inlineIndented) { @@ -1168,32 +160,32 @@ public function format(string $string, bool $highlight = true) : string continue; } - if ($token[self::TOKEN_VALUE] === ',') { + if ($token->value() === ',') { if ($inlineCount >= 30) { $inlineCount = 0; $newline = true; } } - $inlineCount += strlen($token[self::TOKEN_VALUE]); + $inlineCount += strlen($token->value()); } // Opening parentheses increase the block indent level and start a new line - if ($token[self::TOKEN_VALUE] === '(') { + if ($token->value() === '(') { // First check if this should be an inline parentheses block // Examples are "NOW()", "COUNT(*)", "int(10)", key(`somecolumn`), DECIMAL(7,2) // Allow up to 3 non-whitespace tokens inside inline parentheses $length = 0; for ($j=1; $j<=250; $j++) { // Reached end of string - if (! isset($tokens[$i+$j])) { + if (! isset($indexedTokens[$i+$j])) { break; } - $next = $tokens[$i+$j]; + $next = $indexedTokens[$i+$j]['token']; // Reached closing parentheses, able to inline it - if ($next[self::TOKEN_VALUE] === ')') { + if ($next->value() === ')') { $inlineParentheses = true; $inlineCount = 0; $inlineIndented = false; @@ -1201,19 +193,21 @@ public function format(string $string, bool $highlight = true) : string } // Reached an invalid token for inline parentheses - if ($next[self::TOKEN_VALUE]===';' || $next[self::TOKEN_VALUE]==='(') { + if ($next->value()===';' || $next->value()==='(') { break; } // Reached an invalid token type for inline parentheses - if ($next[self::TOKEN_TYPE]===self::TOKEN_TYPE_RESERVED_TOPLEVEL || - $next[self::TOKEN_TYPE]===self::TOKEN_TYPE_RESERVED_NEWLINE || - $next[self::TOKEN_TYPE]===self::TOKEN_TYPE_COMMENT || - $next[self::TOKEN_TYPE]===self::TOKEN_TYPE_BLOCK_COMMENT) { + if ($next->isOfType( + Token::TOKEN_TYPE_RESERVED_TOPLEVEL, + Token::TOKEN_TYPE_RESERVED_NEWLINE, + Token::TOKEN_TYPE_COMMENT, + Token::TOKEN_TYPE_BLOCK_COMMENT + )) { break; } - $length += strlen($next[self::TOKEN_VALUE]); + $length += strlen($next->value()); } if ($inlineParentheses && $length > 30) { @@ -1223,8 +217,8 @@ public function format(string $string, bool $highlight = true) : string } // Take out the preceding space unless there was whitespace there in the original query - if (isset($originalTokens[$token['i']-1]) && - $originalTokens[$token['i']-1][self::TOKEN_TYPE] !== self::TOKEN_TYPE_WHITESPACE) { + if (isset($originalTokens[$indexedToken['originalIndex']-1]) && + ! $originalTokens[$indexedToken['originalIndex']-1]->isOfType(Token::TOKEN_TYPE_WHITESPACE)) { $return = rtrim($return, ' '); } @@ -1233,7 +227,7 @@ public function format(string $string, bool $highlight = true) : string // Add a newline after the parentheses $newline = true; } - } elseif ($token[self::TOKEN_VALUE] === ')') { + } elseif ($token->value() === ')') { // Closing parentheses decrease the block indent level // Remove whitespace before the closing parentheses $return = rtrim($return, ' '); @@ -1255,7 +249,7 @@ public function format(string $string, bool $highlight = true) : string if ($highlight) { $return .= "\n" . $this->highlighter->highlightError( - $token[self::TOKEN_VALUE] + $token->value() ); continue; } @@ -1265,7 +259,7 @@ public function format(string $string, bool $highlight = true) : string if (! $addedNewline) { $return .= "\n" . str_repeat($tab, $indentLevel); } - } elseif ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_RESERVED_TOPLEVEL) { + } elseif ($token->isOfType(Token::TOKEN_TYPE_RESERVED_TOPLEVEL)) { // Top level reserved words start a new line and increase the special indent level $increaseSpecialIndent = true; @@ -1286,24 +280,20 @@ public function format(string $string, bool $highlight = true) : string $return = rtrim($return, $tab) . str_repeat($tab, $indentLevel); } - // If the token may have extra whitespace - if (strpos($token[self::TOKEN_VALUE], ' ')!==false || - strpos($token[self::TOKEN_VALUE], "\n")!==false || - strpos($token[self::TOKEN_VALUE], "\t")!==false) { + if ($token->hasExtraWhitespace()) { $highlighted = preg_replace('/\s+/', ' ', $highlighted); } //if SQL 'LIMIT' clause, start variable to reset newline - if ($token[self::TOKEN_VALUE] === 'LIMIT' && ! $inlineParentheses) { + if ($token->value() === 'LIMIT' && ! $inlineParentheses) { $clauseLimit = true; } } elseif ($clauseLimit && - $token[self::TOKEN_VALUE] !== ',' && - $token[self::TOKEN_TYPE] !== self::TOKEN_TYPE_NUMBER && - $token[self::TOKEN_TYPE] !== self::TOKEN_TYPE_WHITESPACE) { + $token->value() !== ',' && + ! $token->isOfType(Token::TOKEN_TYPE_NUMBER, Token::TOKEN_TYPE_WHITESPACE)) { // Checks if we are out of the limit clause $clauseLimit = false; - } elseif ($token[self::TOKEN_VALUE] === ',' && ! $inlineParentheses) { + } elseif ($token->value() === ',' && ! $inlineParentheses) { // Commas start a new line (unless within inline parentheses or SQL 'LIMIT' clause) //If the previous TOKEN_VALUE is 'LIMIT', resets new line if ($clauseLimit === true) { @@ -1313,56 +303,56 @@ public function format(string $string, bool $highlight = true) : string // All other cases of commas $newline = true; } - } elseif ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_RESERVED_NEWLINE) { + } elseif ($token->isOfType(Token::TOKEN_TYPE_RESERVED_NEWLINE)) { // Newline reserved words start a new line // Add a newline before the reserved word (if not already added) if (! $addedNewline) { $return .= "\n" . str_repeat($tab, $indentLevel); } - // If the token may have extra whitespace - if (strpos($token[self::TOKEN_VALUE], ' ')!==false || - strpos($token[self::TOKEN_VALUE], "\n")!==false || - strpos($token[self::TOKEN_VALUE], "\t")!==false) { + if ($token->hasExtraWhitespace()) { $highlighted = preg_replace('/\s+/', ' ', $highlighted); } - } elseif ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_BOUNDARY) { + } elseif ($token->isOfType(Token::TOKEN_TYPE_BOUNDARY)) { // Multiple boundary characters in a row should not have spaces between them (not including parentheses) - if (isset($tokens[$i-1]) && $tokens[$i-1][self::TOKEN_TYPE] === self::TOKEN_TYPE_BOUNDARY) { - if (isset($originalTokens[$token['i']-1]) && - $originalTokens[$token['i']-1][self::TOKEN_TYPE] !== self::TOKEN_TYPE_WHITESPACE) { + if (isset($indexedTokens[$i-1]) && + $indexedTokens[$i-1]['token']->isOfType(Token::TOKEN_TYPE_BOUNDARY)) { + if (isset($originalTokens[$indexedToken['originalIndex']-1]) && + ! $originalTokens[$indexedToken['originalIndex']-1]->isOfType(Token::TOKEN_TYPE_WHITESPACE)) { $return = rtrim($return, ' '); } } } // If the token shouldn't have a space before it - if ($token[self::TOKEN_VALUE] === '.' || - $token[self::TOKEN_VALUE] === ',' || - $token[self::TOKEN_VALUE] === ';') { + if ($token->value() === '.' || + $token->value() === ',' || + $token->value() === ';') { $return = rtrim($return, ' '); } $return .= $highlighted . ' '; // If the token shouldn't have a space after it - if ($token[self::TOKEN_VALUE] === '(' || $token[self::TOKEN_VALUE] === '.') { + if ($token->value() === '(' || $token->value() === '.') { $return = rtrim($return, ' '); } // If this is the "-" of a negative number, it shouldn't have a space after it - if ($token[self::TOKEN_VALUE] !== '-' || - ! isset($tokens[$i+1]) || - $tokens[$i+1][self::TOKEN_TYPE] !== self::TOKEN_TYPE_NUMBER || - ! isset($tokens[$i-1])) { + if ($token->value() !== '-' || + ! isset($indexedTokens[$i+1]) || + ! $indexedTokens[$i+1]['token']->isOfType(Token::TOKEN_TYPE_NUMBER) || + ! isset($indexedTokens[$i-1])) { continue; } - $prev = $tokens[$i-1][self::TOKEN_TYPE]; - if ($prev === self::TOKEN_TYPE_QUOTE || - $prev === self::TOKEN_TYPE_BACKTICK_QUOTE || - $prev === self::TOKEN_TYPE_WORD || - $prev === self::TOKEN_TYPE_NUMBER) { + $prev = $indexedTokens[$i-1]['token']; + if ($prev->isOfType( + Token::TOKEN_TYPE_QUOTE, + Token::TOKEN_TYPE_BACKTICK_QUOTE, + Token::TOKEN_TYPE_WORD, + Token::TOKEN_TYPE_NUMBER + )) { continue; } @@ -1393,14 +383,14 @@ public function format(string $string, bool $highlight = true) : string */ public function highlight(string $string) : string { - $tokens = $this->tokenize($string); + $tokens = $this->tokenizer->tokenize($string); $return = ''; foreach ($tokens as $token) { $return .= $this->highlighter->highlightToken( - $token[self::TOKEN_TYPE], - $token[self::TOKEN_VALUE] + $token->type(), + $token->value() ); } @@ -1418,25 +408,28 @@ public function compress(string $string) : string { $result = ''; - $tokens = $this->tokenize($string); + $tokens = $this->tokenizer->tokenize($string); $whitespace = true; foreach ($tokens as $token) { // Skip comment tokens - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_COMMENT || - $token[self::TOKEN_TYPE] === self::TOKEN_TYPE_BLOCK_COMMENT) { + if ($token->isOfType(Token::TOKEN_TYPE_COMMENT, Token::TOKEN_TYPE_BLOCK_COMMENT)) { continue; } // Remove extra whitespace in reserved words (e.g "OUTER JOIN" becomes "OUTER JOIN") - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_RESERVED || - $token[self::TOKEN_TYPE] === self::TOKEN_TYPE_RESERVED_NEWLINE || - $token[self::TOKEN_TYPE] === self::TOKEN_TYPE_RESERVED_TOPLEVEL) { - $token[self::TOKEN_VALUE] = preg_replace('/\s+/', ' ', $token[self::TOKEN_VALUE]); + if ($token->isOfType( + Token::TOKEN_TYPE_RESERVED, + Token::TOKEN_TYPE_RESERVED_NEWLINE, + Token::TOKEN_TYPE_RESERVED_TOPLEVEL + )) { + $newValue = preg_replace('/\s+/', ' ', $token->value()); + assert($newValue !== null); + $token = $token->withValue($newValue); } - if ($token[self::TOKEN_TYPE] === self::TOKEN_TYPE_WHITESPACE) { + if ($token->isOfType(Token::TOKEN_TYPE_WHITESPACE)) { // If the last token was whitespace, don't add another one if ($whitespace) { continue; @@ -1444,31 +437,17 @@ public function compress(string $string) : string $whitespace = true; // Convert all whitespace to a single space - $token[self::TOKEN_VALUE] = ' '; + $token = $token->withValue(' '); } else { $whitespace = false; } - $result .= $token[self::TOKEN_VALUE]; + $result .= $token->value(); } return rtrim($result); } - /** - * Helper function for building regular expressions for reserved words and boundary characters - * - * @param string[] $strings The strings to be quoted - * - * @return string[] The quoted strings - */ - private function quoteRegex(array $strings) : array - { - return array_map(static function (string $string) : string { - return preg_quote($string, '/'); - }, $strings); - } - private function isCli() : bool { if (isset($this->cli)) { diff --git a/src/Token.php b/src/Token.php new file mode 100644 index 0000000..425a1f7 --- /dev/null +++ b/src/Token.php @@ -0,0 +1,69 @@ +type = $type; + $this->value = $value; + } + + public function value() : string + { + return $this->value; + } + + public function type() : int + { + return $this->type; + } + + public function isOfType(int ...$types) : bool + { + return in_array($this->type, $types, true); + } + + public function hasExtraWhitespace() : bool + { + return strpos($this->value(), ' ')!== false || + strpos($this->value(), "\n") !== false || + strpos($this->value(), "\t") !== false; + } + + public function withValue(string $value) : self + { + return new self($this->type(), $value); + } +} diff --git a/src/Tokenizer.php b/src/Tokenizer.php new file mode 100644 index 0000000..ad941c4 --- /dev/null +++ b/src/Tokenizer.php @@ -0,0 +1,997 @@ +', + '+', + '-', + '*', + '/', + '!', + '^', + '%', + '|', + '&', + '#', + ]; + + // Cache variables + // Only tokens shorter than this size will be cached. Somewhere between 10 + // and 20 seems to work well for most cases. + /** @var int */ + public $maxCachekeySize = 15; + + /** @var Token[] */ + private $tokenCache = []; + + /** @var int */ + private $cacheHits = 0; + + /** @var int */ + private $cacheMisses = 0; + + /** + * Stuff that only needs to be done once. Builds regular expressions and + * sorts the reserved words. + */ + public function __construct() + { + // Sort reserved word list from longest word to shortest, 3x faster than usort + $reservedMap = array_combine($this->reserved, array_map('strlen', $this->reserved)); + assert($reservedMap !== false); + arsort($reservedMap); + $this->reserved = array_keys($reservedMap); + + // Set up regular expressions + $this->regexBoundaries = '(' . implode( + '|', + $this->quoteRegex($this->boundaries) + ) . ')'; + $this->regexReserved = '(' . implode( + '|', + $this->quoteRegex($this->reserved) + ) . ')'; + $this->regexReservedToplevel = str_replace(' ', '\\s+', '(' . implode( + '|', + $this->quoteRegex($this->reservedToplevel) + ) . ')'); + $this->regexReservedNewline = str_replace(' ', '\\s+', '(' . implode( + '|', + $this->quoteRegex($this->reservedNewline) + ) . ')'); + + $this->regexFunction = '(' . implode('|', $this->quoteRegex($this->functions)) . ')'; + } + + /** + * Takes a SQL string and breaks it into tokens. + * Each token is an associative array with type and value. + * + * @param string $string The SQL string + * + * @return array An array of tokens. + */ + public function tokenize(string $string) : array + { + $tokens = []; + + // Used to make sure the string keeps shrinking on each iteration + $oldStringLen = strlen($string) + 1; + + $token = null; + + $currentLength = strlen($string); + + // Keep processing the string until it is empty + while ($currentLength) { + // If the string stopped shrinking, there was a problem + if ($oldStringLen <= $currentLength) { + $tokens[] = new Token(Token::TOKEN_TYPE_ERROR, $string); + + return $tokens; + } + + $oldStringLen = $currentLength; + + // Determine if we can use caching + if ($currentLength >= $this->maxCachekeySize) { + $cacheKey = substr($string, 0, $this->maxCachekeySize); + } else { + $cacheKey = false; + } + + // See if the token is already cached + if ($cacheKey && isset($this->tokenCache[$cacheKey])) { + // Retrieve from cache + $token = $this->tokenCache[$cacheKey]; + $tokenLength = strlen($token->value()); + $this->cacheHits++; + } else { + // Get the next token and the token type + $token = $this->getNextToken($string, $token); + $tokenLength = strlen($token->value()); + $this->cacheMisses++; + + // If the token is shorter than the max length, store it in cache + if ($cacheKey && $tokenLength < $this->maxCachekeySize) { + $this->tokenCache[$cacheKey] = $token; + } + } + + $tokens[] = $token; + + // Advance the string + $string = substr($string, $tokenLength); + + $currentLength -= $tokenLength; + } + + return $tokens; + } + + /** + * Return the next token and token type in a SQL string. + * Quoted strings, comments, reserved words, whitespace, and punctuation + * are all their own tokens. + * + * @param string $string The SQL string + * @param Token|null $previous The result of the previous getNextToken() call + * + * @return Token An associative array containing the type and value of the token. + */ + private function getNextToken(string $string, ?Token $previous = null) : Token + { + $matches = []; + // Whitespace + if (preg_match('/^\s+/', $string, $matches)) { + return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); + } + + // Comment + if ($string[0] === '#' || + (isset($string[1]) && ($string[0]==='-' && $string[1]==='-') || + ($string[0]==='/' && $string[1]==='*'))) { + // Comment until end of line + if ($string[0] === '-' || $string[0] === '#') { + $last = strpos($string, "\n"); + $type = Token::TOKEN_TYPE_COMMENT; + } else { // Comment until closing comment tag + $pos = strpos($string, '*/', 2); + assert($pos !== false); + $last = $pos + 2; + $type = Token::TOKEN_TYPE_BLOCK_COMMENT; + } + + if ($last === false) { + $last = strlen($string); + } + + return new Token($type, substr($string, 0, $last)); + } + + // Quoted String + if ($string[0]==='"' || $string[0]==='\'' || $string[0]==='`' || $string[0]==='[') { + return new Token( + ($string[0]==='`' || $string[0]==='[' + ? Token::TOKEN_TYPE_BACKTICK_QUOTE + : Token::TOKEN_TYPE_QUOTE), + $this->getQuotedString($string) + ); + } + + // User-defined Variable + if (($string[0] === '@' || $string[0] === ':') && isset($string[1])) { + $value = null; + $type = Token::TOKEN_TYPE_VARIABLE; + + // If the variable name is quoted + if ($string[1]==='"' || $string[1]==='\'' || $string[1]==='`') { + $value = $string[0] . $this->getQuotedString(substr($string, 1)); + } else { + // Non-quoted variable name + preg_match('/^(' . $string[0] . '[a-zA-Z0-9\._\$]+)/', $string, $matches); + if ($matches) { + $value = $matches[1]; + } + } + + if ($value !== null) { + return new Token($type, $value); + } + } + + // Number (decimal, binary, or hex) + if (preg_match( + '/^([0-9]+(\.[0-9]+)?|0x[0-9a-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', + $string, + $matches + )) { + return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]); + } + + // Boundary Character (punctuation and symbols) + if (preg_match('/^(' . $this->regexBoundaries . ')/', $string, $matches)) { + return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); + } + + // A reserved word cannot be preceded by a '.' + // this makes it so in "mytable.from", "from" is not considered a reserved word + if (! $previous || $previous->value() !== '.') { + $upper = strtoupper($string); + // Top Level Reserved Word + if (preg_match( + '/^(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', + $upper, + $matches + )) { + return new Token( + Token::TOKEN_TYPE_RESERVED_TOPLEVEL, + substr($string, 0, strlen($matches[1])) + ); + } + + // Newline Reserved Word + if (preg_match( + '/^(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', + $upper, + $matches + )) { + return new Token( + Token::TOKEN_TYPE_RESERVED_NEWLINE, + substr($string, 0, strlen($matches[1])) + ); + } + + // Other Reserved Word + if (preg_match( + '/^(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', + $upper, + $matches + )) { + return new Token( + Token::TOKEN_TYPE_RESERVED, + substr($string, 0, strlen($matches[1])) + ); + } + } + + // A function must be suceeded by '(' + // this makes it so "count(" is considered a function, but "count" alone is not + $upper = strtoupper($string); + // function + if (preg_match('/^(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { + return new Token( + Token::TOKEN_TYPE_RESERVED, + substr($string, 0, strlen($matches[1])-1) + ); + } + + // Non reserved word + preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches); + + return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); + } + + /** + * Helper function for building regular expressions for reserved words and boundary characters + * + * @param string[] $strings The strings to be quoted + * + * @return string[] The quoted strings + */ + private function quoteRegex(array $strings) : array + { + return array_map(static function (string $string) : string { + return preg_quote($string, '/'); + }, $strings); + } + + /** + * Get stats about the token cache + * + * @return mixed[] An array containing the keys 'hits', 'misses', 'entries', and 'size' in bytes + */ + public function getCacheStats() : array + { + return [ + 'hits' => $this->cacheHits, + 'misses' => $this->cacheMisses, + 'entries' => count($this->tokenCache), + 'size' => strlen(serialize($this->tokenCache)), + ]; + } + + private function getQuotedString(string $string) : string + { + $ret = ''; + + // This checks for the following patterns: + // 1. backtick quoted string using `` to escape + // 2. square bracket quoted string (SQL Server) using ]] to escape + // 3. double quoted string using "" or \" to escape + // 4. single quoted string using '' or \' to escape + if (preg_match( + '/^(((`[^`]*($|`))+)| + ((\[[^\]]*($|\]))(\][^\]]*($|\]))*)| + (("[^"\\\\]*(?:\\\\.[^"\\\\]*)*("|$))+)| + ((\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*(\'|$))+))/sx', + $string, + $matches + )) { + $ret = $matches[1]; + } + + return $ret; + } +} diff --git a/tests/SqlFormatterTest.php b/tests/SqlFormatterTest.php index 2f58294..53db8e6 100644 --- a/tests/SqlFormatterTest.php +++ b/tests/SqlFormatterTest.php @@ -7,6 +7,7 @@ use Doctrine\SqlFormatter\CliHighlighter; use Doctrine\SqlFormatter\HtmlHighlighter; use Doctrine\SqlFormatter\SqlFormatter; +use Doctrine\SqlFormatter\Tokenizer; use PHPUnit\Framework\TestCase; use function assert; use function defined; @@ -23,18 +24,26 @@ final class SqlFormatterTest extends TestCase /** @var string[] */ private $sqlData; + /** @var Tokenizer */ + private static $tokenizer; + /** @var SqlFormatter */ - private static $formatter; + private $formatter; /** @var HtmlHighlighter */ - private static $highlighter; + private $highlighter; public static function setUpBeforeClass() : void + { + self::$tokenizer = new Tokenizer(); + } + + protected function setUp() : void { // Force SqlFormatter to run in non-CLI mode for tests - self::$highlighter = new HtmlHighlighter(); + $this->highlighter = new HtmlHighlighter(); - self::$formatter = new SqlFormatter(self::$highlighter); + $this->formatter = new SqlFormatter(self::$tokenizer, $this->highlighter); } /** @@ -42,7 +51,7 @@ public static function setUpBeforeClass() : void */ public function testFormatHighlight(string $sql, string $html) : void { - $this->assertEquals(trim($html), trim(self::$formatter->format($sql))); + $this->assertEquals(trim($html), trim($this->formatter->format($sql))); } /** @@ -50,7 +59,7 @@ public function testFormatHighlight(string $sql, string $html) : void */ public function testFormat(string $sql, string $html) : void { - $this->assertEquals(trim($html), trim(self::$formatter->format($sql, false))); + $this->assertEquals(trim($html), trim($this->formatter->format($sql, false))); } /** @@ -58,7 +67,7 @@ public function testFormat(string $sql, string $html) : void */ public function testHighlight(string $sql, string $html) : void { - $this->assertEquals(trim($html), trim(self::$formatter->highlight($sql))); + $this->assertEquals(trim($html), trim($this->formatter->highlight($sql))); } public function testHighlightBinary() : void @@ -77,7 +86,7 @@ public function testHighlightBinary() : void $binaryData . ' AS BINARY'; - $this->assertEquals(trim($html), trim(self::$formatter->highlight($sql))); + $this->assertEquals(trim($html), trim($this->formatter->highlight($sql))); } /** @@ -85,7 +94,7 @@ public function testHighlightBinary() : void */ public function testCliHighlight(string $sql, string $html) : void { - $formatter = new SqlFormatter(new CliHighlighter()); + $formatter = new SqlFormatter(self::$tokenizer, new CliHighlighter()); $this->assertEquals(trim($html), trim($formatter->format($sql))); } @@ -94,18 +103,18 @@ public function testCliHighlight(string $sql, string $html) : void */ public function testCompress(string $sql, string $html) : void { - $this->assertEquals(trim($html), trim(self::$formatter->compress($sql))); + $this->assertEquals(trim($html), trim($this->formatter->compress($sql))); } public function testUsePre() : void { - self::$highlighter->usePre = false; - $actual = self::$formatter->highlight('test'); + $this->highlighter->usePre = false; + $actual = $this->formatter->highlight('test'); $expected = 'test'; $this->assertEquals($actual, $expected); - self::$highlighter->usePre = true; - $actual = self::$formatter->highlight('test'); + $this->highlighter->usePre = true; + $actual = $this->formatter->highlight('test'); $expected = '
' .
             'test
'; $this->assertEquals($actual, $expected); @@ -113,7 +122,7 @@ public function testUsePre() : void public function testCacheStats() : void { - $stats = self::$formatter->getCacheStats(); + $stats = self::$tokenizer->getCacheStats(); $this->assertGreaterThan(1, $stats['hits']); } @@ -241,14 +250,14 @@ public function sqlData() : array $clihighlight = array(); foreach($this->sqlData as $sql) { - $formatHighlight[] = trim(self::$formatter->format($sql)); - $highlight[] = trim(self::$formatter->highlight($sql)); - $format[] = trim(self::$formatter->format($sql, false)); - $compress[] = trim(self::$formatter->compress($sql)); - - self::$formatter->cli = true; - $clihighlight[] = trim(self::$formatter->format($sql)); - self::$formatter->cli = false; + $formatHighlight[] = trim($this->formatter->format($sql)); + $highlight[] = trim($this->formatter->highlight($sql)); + $format[] = trim($this->formatter->format($sql, false)); + $compress[] = trim($this->formatter->compress($sql)); + + $this->formatter->cli = true; + $clihighlight[] = trim($this->formatter->format($sql)); + $this->formatter->cli = false; } file_put_contents(__DIR__."/format-highlight.html", implode("\n\n",$formatHighlight)); diff --git a/tests/performance.php b/tests/performance.php index 392d06e..74e6bd4 100644 --- a/tests/performance.php +++ b/tests/performance.php @@ -5,12 +5,14 @@ require __DIR__ . '/../vendor/autoload.php'; use Doctrine\SqlFormatter\SqlFormatter; +use Doctrine\SqlFormatter\Tokenizer; -$formatter = new SqlFormatter(); +$tokenizer = new Tokenizer(); +$tokenizer->maxCachekeySize = 15; //this is the default value //set to '0' to disable caching //a value between 10 and 20 seems to give the best result -$formatter->maxCachekeySize = 15; +$formatter = new SqlFormatter($tokenizer); //the sample query file is filled with install scripts for PrestaShop //and some sample catalog data from Magento @@ -40,7 +42,7 @@ $end = microtime(true); ?> -

Formatted queries using a maxCachekeySize of maxCachekeySize ?>

+

Formatted queries using a maxCachekeySize of maxCachekeySize ?>

Average query length of characters

Took seconds total, @@ -49,4 +51,4 @@

Used bytes of memory

-

Cache Stats

getCacheStats(), true) ?>
+

Cache Stats

getCacheStats(), true) ?>