From 3d5d5e4b2d37b6629c581231812ae7bf5633004f Mon Sep 17 00:00:00 2001 From: Dao Hoang Son Date: Sun, 15 Jan 2017 11:06:37 +0700 Subject: [PATCH 1/2] Update urlchar to handle character escaping. Basic idea: urlchar should accept [(ascii characters minus those that need escaping)(non ascii characters)(escaped sequences)]. The latter two parts are already taken care of by the {nonascii} and {escape} macros. Below is the broken-down explanation for the first part: ASCII characters range = [\u0020-\u007e] Skip space \u0020 = [\u0021-\u007e] Skip quotation mark \u0022 = [\u0021\u0023-\u007e] Skip apostrophe \u0027 = [\u0021\u0023-\u0026\u0028-\u007e] Skip reverse solidus \u005c = [\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e] Also, the left square bracket (\u005b) and right (\u005d) need escaping themselves, hence the final regex --- scanner/scanner.go | 8 ++++---- scanner/scanner_test.go | 6 ++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scanner/scanner.go b/scanner/scanner.go index 6a53211..4fa1ea4 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -114,7 +114,7 @@ var macros = map[string]string{ "num": `[0-9]*\.[0-9]+|[0-9]+`, "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, "stringchar": `{urlchar}|[ ]|\\{nl}`, - "urlchar": "[\u0009\u0021\u0023-\u0026\u0027-\u007E]|{nonascii}|{escape}", + "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", "nl": `[\n\r\f]|\r\n`, "w": `{wc}*`, "wc": `[\t\n\f\r ]`, @@ -254,10 +254,10 @@ func (s *Scanner) Next() *Token { match := matchers[TokenString].FindString(input) if match != "" { return s.emitToken(TokenString, match) - } else { - s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} - return s.err } + + s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} + return s.err case '/': // Comment, error or Char. 
if len(input) > 1 && input[1] == '*' { diff --git a/scanner/scanner_test.go b/scanner/scanner_test.go index 6e28723..e9056af 100644 --- a/scanner/scanner_test.go +++ b/scanner/scanner_test.go @@ -31,7 +31,13 @@ func TestMatchers(t *testing.T) { checkMatch("abcd", TokenIdent, "abcd") checkMatch(`"abcd"`, TokenString, `"abcd"`) + checkMatch(`"ab'cd"`, TokenString, `"ab'cd"`) + checkMatch(`"ab\"cd"`, TokenString, `"ab\"cd"`) + checkMatch(`"ab\\cd"`, TokenString, `"ab\\cd"`) checkMatch("'abcd'", TokenString, "'abcd'") + checkMatch(`'ab"cd'`, TokenString, `'ab"cd'`) + checkMatch(`'ab\'cd'`, TokenString, `'ab\'cd'`) + checkMatch(`'ab\\cd'`, TokenString, `'ab\\cd'`) checkMatch("#name", TokenHash, "#name") checkMatch("42''", TokenNumber, "42", TokenString, "''") checkMatch("4.2", TokenNumber, "4.2") From 8934a2a5a4ff94d472307cb6f4cd515be6cbcb52 Mon Sep 17 00:00:00 2001 From: Dao Hoang Son Date: Sun, 15 Jan 2017 16:29:02 +0700 Subject: [PATCH 2/2] Add urlchar macro explanation --- scanner/scanner.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scanner/scanner.go b/scanner/scanner.go index 4fa1ea4..84e680e 100644 --- a/scanner/scanner.go +++ b/scanner/scanner.go @@ -114,10 +114,18 @@ var macros = map[string]string{ "num": `[0-9]*\.[0-9]+|[0-9]+`, "string": `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`, "stringchar": `{urlchar}|[ ]|\\{nl}`, - "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", "nl": `[\n\r\f]|\r\n`, "w": `{wc}*`, "wc": `[\t\n\f\r ]`, + + // urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}] + // ASCII characters range = `[\u0020-\u007e]` + // Skip space \u0020 = `[\u0021-\u007e]` + // Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]` + // Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]` + // Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]` + // Finally, the left square bracket (\u005b) and right (\u005d) 
need escaping themselves + "urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}", } // productions maps the list of tokens to patterns to be expanded.