From 4c9a80580d1af536d710765337215841b1319564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Tue, 11 Jun 2024 15:05:44 +0200 Subject: [PATCH 1/5] Refactor Tokenizer::createNextToken() to accept string and offset --- src/Tokenizer.php | 67 +++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 3ec9c91..50613e9 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -800,17 +800,17 @@ public function tokenize(string $string): Cursor { $tokens = []; - $token = null; + $upper = strtoupper($string); + $offset = 0; + $token = null; // Keep processing the string until it is empty - while ($string !== '') { + while ($offset < strlen($string)) { // Get the next token and the token type - $token = $this->createNextToken($string, $token); + $token = $this->createNextToken($string, $upper, $offset, $token); + $offset += strlen($token->value()); $tokens[] = $token; - - // Advance the string - $string = substr($string, strlen($token->value())); } return new Cursor($tokens); @@ -822,30 +822,33 @@ public function tokenize(string $string): Cursor * are all their own tokens. * * @param string $string The SQL string + * @param string $upper The SQL string in upper case * @param Token|null $previous The result of the previous createNextToken() call * * @return Token An associative array containing the type and value of the token. */ - private function createNextToken(string $string, Token|null $previous = null): Token + private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token { + $stringSlow = substr($string, $offset); + $matches = []; // Whitespace - if (preg_match('/^\s+/', $string, $matches)) { + if (preg_match('/^\s+/', $stringSlow, $matches)) { return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); } // Comment if ( - $string[0] === '#' || - (isset($string[1]) && ($string[0] === '-' && $string[1] === '-') || - (isset($string[1]) && $string[0] === '/' && $string[1] === '*')) + $stringSlow[0] === '#' || + (isset($stringSlow[1]) && ($stringSlow[0] === '-' && $stringSlow[1] === '-') || + (isset($stringSlow[1]) && $stringSlow[0] === '/' && $stringSlow[1] === '*')) ) { // Comment until end of line - if ($string[0] === '-' || $string[0] === '#') { - $last = strpos($string, "\n"); + if ($stringSlow[0] === '-' || $stringSlow[0] === '#') { + $last = strpos($stringSlow, "\n"); $type = Token::TOKEN_TYPE_COMMENT; } else { // Comment until closing comment tag - $pos = strpos($string, '*/', 2); + $pos = strpos($stringSlow, '*/', 2); $last = $pos !== false ? $pos + 2 : false; @@ -853,33 +856,33 @@ private function createNextToken(string $string, Token|null $previous = null): T } if ($last === false) { - $last = strlen($string); + $last = strlen($stringSlow); } - return new Token($type, substr($string, 0, $last)); + return new Token($type, substr($stringSlow, 0, $last)); } // Quoted String - if ($string[0] === '"' || $string[0] === '\'' || $string[0] === '`' || $string[0] === '[') { + if ($stringSlow[0] === '"' || $stringSlow[0] === '\'' || $stringSlow[0] === '`' || $stringSlow[0] === '[') { return new Token( - ($string[0] === '`' || $string[0] === '[' + ($stringSlow[0] === '`' || $stringSlow[0] === '[' ? Token::TOKEN_TYPE_BACKTICK_QUOTE : Token::TOKEN_TYPE_QUOTE), - $this->getNextQuotedString($string), + $this->getNextQuotedString($stringSlow), ); } // User-defined Variable - if (($string[0] === '@' || $string[0] === ':') && isset($string[1])) { + if (($stringSlow[0] === '@' || $stringSlow[0] === ':') && isset($stringSlow[1])) { $value = null; $type = Token::TOKEN_TYPE_VARIABLE; // If the variable name is quoted - if ($string[1] === '"' || $string[1] === '\'' || $string[1] === '`') { - $value = $string[0] . $this->getNextQuotedString(substr($string, 1)); + if ($stringSlow[1] === '"' || $stringSlow[1] === '\'' || $stringSlow[1] === '`') { + $value = $stringSlow[0] . $this->getNextQuotedString(substr($stringSlow, 1)); } else { // Non-quoted variable name - preg_match('/^(' . $string[0] . '[\w.$]+)/', $string, $matches); + preg_match('/^(' . $stringSlow[0] . '[\w.$]+)/', $stringSlow, $matches); if ($matches) { $value = $matches[1]; } @@ -894,7 +897,7 @@ private function createNextToken(string $string, Token|null $previous = null): T if ( preg_match( '/^(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', - $string, + $stringSlow, $matches, ) ) { @@ -902,14 +905,14 @@ private function createNextToken(string $string, Token|null $previous = null): T } // Boundary Character (punctuation and symbols) - if (preg_match('/^(' . $this->regexBoundaries . ')/', $string, $matches)) { + if (preg_match('/^(' . $this->regexBoundaries . ')/', $stringSlow, $matches)) { return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); } // A reserved word cannot be preceded by a '.' // this makes it so in "mytable.from", "from" is not considered a reserved word if ($previous === null || $previous->value() !== '.') { - $upper = strtoupper($string); + $upper = strtoupper($stringSlow); // Top Level Reserved Word if ( preg_match( @@ -920,7 +923,7 @@ private function createNextToken(string $string, Token|null $previous = null): T ) { return new Token( Token::TOKEN_TYPE_RESERVED_TOPLEVEL, - substr($string, 0, strlen($matches[1])), + substr($stringSlow, 0, strlen($matches[1])), ); } @@ -934,7 +937,7 @@ private function createNextToken(string $string, Token|null $previous = null): T ) { return new Token( Token::TOKEN_TYPE_RESERVED_NEWLINE, - substr($string, 0, strlen($matches[1])), + substr($stringSlow, 0, strlen($matches[1])), ); } @@ -948,24 +951,24 @@ private function createNextToken(string $string, Token|null $previous = null): T ) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($string, 0, strlen($matches[1])), + substr($stringSlow, 0, strlen($matches[1])), ); } } // A function must be succeeded by '(' // this makes it so "count(" is considered a function, but "count" alone is not - $upper = strtoupper($string); + $upper = strtoupper($stringSlow); // function if (preg_match('/^(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($string, 0, strlen($matches[1]) - 1), + substr($stringSlow, 0, strlen($matches[1]) - 1), ); } // Non reserved word - preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches); + preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $stringSlow, $matches); return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); } From 15887a8229e7b714d2bf59b3144de3fcdd9516bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Tue, 11 Jun 2024 15:07:31 +0200 Subject: [PATCH 2/5] Change "^" to "\G" in regexes to match offset start `^` matches "string start", `\G` is the same but matches start given by the 5th `preg_match` `$offset` argument. --- src/Tokenizer.php | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 50613e9..f293681 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -833,7 +833,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok $matches = []; // Whitespace - if (preg_match('/^\s+/', $stringSlow, $matches)) { + if (preg_match('/\G\s+/', $stringSlow, $matches)) { return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); } @@ -882,7 +882,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok $value = $stringSlow[0] . $this->getNextQuotedString(substr($stringSlow, 1)); } else { // Non-quoted variable name - preg_match('/^(' . $stringSlow[0] . '[\w.$]+)/', $stringSlow, $matches); + preg_match('/\G(' . $stringSlow[0] . '[\w.$]+)/', $stringSlow, $matches); if ($matches) { $value = $matches[1]; } @@ -896,7 +896,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Number (decimal, binary, or hex) if ( preg_match( - '/^(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', + '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', $stringSlow, $matches, ) @@ -905,7 +905,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // Boundary Character (punctuation and symbols) - if (preg_match('/^(' . $this->regexBoundaries . ')/', $stringSlow, $matches)) { + if (preg_match('/\G(' . $this->regexBoundaries . ')/', $stringSlow, $matches)) { return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); } @@ -916,7 +916,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Top Level Reserved Word if ( preg_match( - '/^(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', + '/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, ) @@ -930,7 +930,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Newline Reserved Word if ( preg_match( - '/^(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', + '/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, ) @@ -944,7 +944,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // Other Reserved Word if ( preg_match( - '/^(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', + '/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, ) @@ -960,7 +960,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // this makes it so "count(" is considered a function, but "count" alone is not $upper = strtoupper($stringSlow); // function - if (preg_match('/^(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { + if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { return new Token( Token::TOKEN_TYPE_RESERVED, substr($stringSlow, 0, strlen($matches[1]) - 1), @@ -968,7 +968,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // Non reserved word - preg_match('/^(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $stringSlow, $matches); + preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $stringSlow, $matches); return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); } @@ -1000,7 +1000,7 @@ private function getNextQuotedString(string $string): string if ( preg_match( <<<'EOD' - ~^(?>(?sx) + ~\G(?>(?sx) (?:`[^`]*(?:$|`))+ |(?:\[[^\]]*($|\]))(?:\][^\]]*(?:$|\]))* |(?:"[^"\\]*(?:\\.[^"\\]*)*(?:"|$))+ From b3b1c95872847f57ddebe806337de99f8789453a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Tue, 11 Jun 2024 15:08:07 +0200 Subject: [PATCH 3/5] Refactor Tokenizer::getNextQuotedString() to accept string and offset --- src/Tokenizer.php | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index f293681..3483ad5 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -868,7 +868,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ($stringSlow[0] === '`' || $stringSlow[0] === '[' ? Token::TOKEN_TYPE_BACKTICK_QUOTE : Token::TOKEN_TYPE_QUOTE), - $this->getNextQuotedString($stringSlow), + $this->getNextQuotedString($string, $offset), ); } @@ -879,7 +879,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // If the variable name is quoted if ($stringSlow[1] === '"' || $stringSlow[1] === '\'' || $stringSlow[1] === '`') { - $value = $stringSlow[0] . $this->getNextQuotedString(substr($stringSlow, 1)); + $value = $stringSlow[0] . $this->getNextQuotedString($string, $offset + 1); } else { // Non-quoted variable name preg_match('/\G(' . $stringSlow[0] . '[\w.$]+)/', $stringSlow, $matches); @@ -988,7 +988,7 @@ private function quoteRegex(array $strings): array ); } - private function getNextQuotedString(string $string): string + private function getNextQuotedString(string $string, int $offset): string { $ret = ''; @@ -1009,6 +1009,8 @@ private function getNextQuotedString(string $string): string EOD, $string, $matches, + 0, + $offset, ) ) { $ret = $matches[0]; From 459f48f59da9f6749ab9949cbabf7f79afbce016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Fri, 31 May 2024 22:08:28 +0200 Subject: [PATCH 4/5] Refactor all "strtoupper($stringSlow)" code --- src/Tokenizer.php | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 3483ad5..12f47e9 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -912,13 +912,14 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // A reserved word cannot be preceded by a '.' // this makes it so in "mytable.from", "from" is not considered a reserved word if ($previous === null || $previous->value() !== '.') { - $upper = strtoupper($stringSlow); // Top Level Reserved Word if ( preg_match( '/\G(' . $this->regexReservedToplevel . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, + 0, + $offset, ) ) { return new Token( @@ -933,6 +934,8 @@ private function createNextToken(string $string, string $upper, int $offset, Tok '/\G(' . $this->regexReservedNewline . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, + 0, + $offset, ) ) { return new Token( @@ -947,6 +950,8 @@ private function createNextToken(string $string, string $upper, int $offset, Tok '/\G(' . $this->regexReserved . ')($|\s|' . $this->regexBoundaries . ')/', $upper, $matches, + 0, + $offset, ) ) { return new Token( @@ -958,9 +963,8 @@ private function createNextToken(string $string, string $upper, int $offset, Tok // A function must be succeeded by '(' // this makes it so "count(" is considered a function, but "count" alone is not - $upper = strtoupper($stringSlow); // function - if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches)) { + if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) { return new Token( Token::TOKEN_TYPE_RESERVED, substr($stringSlow, 0, strlen($matches[1]) - 1), From af658afaad775b5510a62c4f1d8a76f9beee6e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20Vo=C5=99=C3=AD=C5=A1ek?= Date: Tue, 11 Jun 2024 15:09:14 +0200 Subject: [PATCH 5/5] Refactor all remaining "$stringSlow" code --- src/Tokenizer.php | 48 +++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Tokenizer.php b/src/Tokenizer.php index 12f47e9..9277364 100644 --- a/src/Tokenizer.php +++ b/src/Tokenizer.php @@ -829,26 +829,24 @@ public function tokenize(string $string): Cursor */ private function createNextToken(string $string, string $upper, int $offset, Token|null $previous = null): Token { - $stringSlow = substr($string, $offset); - $matches = []; // Whitespace - if (preg_match('/\G\s+/', $stringSlow, $matches)) { + if (preg_match('/\G\s+/', $string, $matches, 0, $offset)) { return new Token(Token::TOKEN_TYPE_WHITESPACE, $matches[0]); } // Comment if ( - $stringSlow[0] === '#' || - (isset($stringSlow[1]) && ($stringSlow[0] === '-' && $stringSlow[1] === '-') || - (isset($stringSlow[1]) && $stringSlow[0] === '/' && $stringSlow[1] === '*')) + $string[$offset] === '#' || + (isset($string[$offset + 1]) && ($string[$offset] === '-' && $string[$offset + 1] === '-') || + (isset($string[$offset + 1]) && $string[$offset] === '/' && $string[$offset + 1] === '*')) ) { // Comment until end of line - if ($stringSlow[0] === '-' || $stringSlow[0] === '#') { - $last = strpos($stringSlow, "\n"); + if ($string[$offset] === '-' || $string[$offset] === '#') { + $last = strpos($string, "\n", $offset); $type = Token::TOKEN_TYPE_COMMENT; } else { // Comment until closing comment tag - $pos = strpos($stringSlow, '*/', 2); + $pos = strpos($string, '*/', $offset + 2); $last = $pos !== false ? $pos + 2 : false; @@ -856,16 +854,16 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } if ($last === false) { - $last = strlen($stringSlow); + $last = strlen($string); } - return new Token($type, substr($stringSlow, 0, $last)); + return new Token($type, substr($string, $offset, $last - $offset)); } // Quoted String - if ($stringSlow[0] === '"' || $stringSlow[0] === '\'' || $stringSlow[0] === '`' || $stringSlow[0] === '[') { + if ($string[$offset] === '"' || $string[$offset] === '\'' || $string[$offset] === '`' || $string[$offset] === '[') { return new Token( - ($stringSlow[0] === '`' || $stringSlow[0] === '[' + ($string[$offset] === '`' || $string[$offset] === '[' ? Token::TOKEN_TYPE_BACKTICK_QUOTE : Token::TOKEN_TYPE_QUOTE), $this->getNextQuotedString($string, $offset), @@ -873,16 +871,16 @@ private function createNextToken(string $string, string $upper, int $offset, Tok } // User-defined Variable - if (($stringSlow[0] === '@' || $stringSlow[0] === ':') && isset($stringSlow[1])) { + if (($string[$offset] === '@' || $string[$offset] === ':') && isset($string[$offset + 1])) { $value = null; $type = Token::TOKEN_TYPE_VARIABLE; // If the variable name is quoted - if ($stringSlow[1] === '"' || $stringSlow[1] === '\'' || $stringSlow[1] === '`') { - $value = $stringSlow[0] . $this->getNextQuotedString($string, $offset + 1); + if ($string[$offset + 1] === '"' || $string[$offset + 1] === '\'' || $string[$offset + 1] === '`') { + $value = $string[$offset] . $this->getNextQuotedString($string, $offset + 1); } else { // Non-quoted variable name - preg_match('/\G(' . $stringSlow[0] . '[\w.$]+)/', $stringSlow, $matches); + preg_match('/\G(' . $string[$offset] . '[\w.$]+)/', $string, $matches, 0, $offset); if ($matches) { $value = $matches[1]; } @@ -897,15 +895,17 @@ private function createNextToken(string $string, string $upper, int $offset, Tok if ( preg_match( '/\G(\d+(\.\d+)?|0x[\da-fA-F]+|0b[01]+)($|\s|"\'`|' . $this->regexBoundaries . ')/', - $stringSlow, + $string, $matches, + 0, + $offset, ) ) { return new Token(Token::TOKEN_TYPE_NUMBER, $matches[1]); } // Boundary Character (punctuation and symbols) - if (preg_match('/\G(' . $this->regexBoundaries . ')/', $stringSlow, $matches)) { + if (preg_match('/\G(' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset)) { return new Token(Token::TOKEN_TYPE_BOUNDARY, $matches[1]); } @@ -924,7 +924,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED_TOPLEVEL, - substr($stringSlow, 0, strlen($matches[1])), + substr($string, $offset, strlen($matches[1])), ); } @@ -940,7 +940,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED_NEWLINE, - substr($stringSlow, 0, strlen($matches[1])), + substr($string, $offset, strlen($matches[1])), ); } @@ -956,7 +956,7 @@ private function createNextToken(string $string, string $upper, int $offset, Tok ) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($stringSlow, 0, strlen($matches[1])), + substr($string, $offset, strlen($matches[1])), ); } } @@ -967,12 +967,12 @@ private function createNextToken(string $string, string $upper, int $offset, Tok if (preg_match('/\G(' . $this->regexFunction . '[(]|\s|[)])/', $upper, $matches, 0, $offset)) { return new Token( Token::TOKEN_TYPE_RESERVED, - substr($stringSlow, 0, strlen($matches[1]) - 1), + substr($string, $offset, strlen($matches[1]) - 1), ); } // Non reserved word - preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $stringSlow, $matches); + preg_match('/\G(.*?)($|\s|["\'`]|' . $this->regexBoundaries . ')/', $string, $matches, 0, $offset); return new Token(Token::TOKEN_TYPE_WORD, $matches[1]); }