up: update the textscan some logic and update readme

gookit · Oct 15, 2022 · 75780ae · 75780ae
1 parent 1b9f75c
commit 75780ae
Show file tree

Hide file tree

Showing 7 changed files with 306 additions and 42 deletions.
diff --git a/strutil/README.md b/strutil/README.md
@@ -8,7 +8,7 @@ This is a go string operate util package.
 ## Install
 
 ```bash
-go get github.com/gookit/goutil/dump
+go get github.com/gookit/goutil/strutil
 ```
 
 ## Usage

diff --git a/strutil/textscan/README.md b/strutil/textscan/README.md
@@ -0,0 +1,129 @@
+# TextScan
+
+Package `textscan` implements a text scanner for quickly parsing text contents.
+It can be used to parse formats such as INI and Properties.
+
+## Install
+
+```shell
+go get github.com/gookit/goutil/strutil/textscan
+```
+
+## Examples
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/gookit/goutil/dump"
+	"github.com/gookit/goutil/strutil/textscan"
+	"github.com/gookit/goutil/testutil/assert"
+)
+
+func main() {
+	ts := textscan.TextScanner{}
+	ts.AddMatchers(
+		&textscan.CommentsMatcher{},
+		&textscan.KeyValueMatcher{},
+	)
+
+	ts.SetInput(`
+# comments 1
+name = inhere
+
+// comments 2
+age = 28
+
+/*
+multi line
+comments 3
+*/
+desc = '''
+a multi
+line string
+'''
+`)
+
+	data := make(map[string]string)
+	err := ts.Each(func(t textscan.Token) {
+		fmt.Println("====> Token kind:", t.Kind())
+		fmt.Println(t.String())
+
+		if t.Kind() == textscan.TokValue {
+			v := t.(*textscan.ValueToken)
+			data[v.Key()] = v.Value()
+		}
+	})
+
+	dump.P(data, err)
+}
+```
+
+**Output:**
+
+```shell
+====> Token kind: Comments
+# comments 1
+====> Token kind: Value
+key: name
+value: "inhere"
+comments: 
+====> Token kind: Comments
+// comments 2
+====> Token kind: Value
+key: age
+value: "28"
+comments: 
+====> Token kind: Comments
+/*
+multi line
+comments 3
+*/
+====> Token kind: Value
+key: desc
+value: "\n\na multi\nline string\n"
+comments: 
+
+==== Collected data:
+map[string]string { #len=3
+  "desc": string("
+
+a multi
+line string
+"), #len=22
+  "name": string("inhere"), #len=6
+  "age": string("28"), #len=2
+},
+```
+
+## Functions
+
+```go
+package textscan // import "github.com/gookit/goutil/strutil/textscan"
+
+func AddKind(k Kind, name string)
+func CommentsDetect(str string, inlineChars []byte) (ok, more bool, err error)
+func CommentsDetectEnd(line string) bool
+func IsKindToken(k Kind, tok Token) bool
+func KindString(k Kind) string
+type BaseToken struct{ ... }
+type CommentToken struct{ ... }
+    func NewCommentToken(val string) *CommentToken
+type CommentsMatcher struct{ ... }
+type EmptyToken struct{ ... }
+    func NewEmptyToken() *EmptyToken
+type HandleFn func(t Token)
+type KeyValueMatcher struct{ ... }
+type Kind rune
+    const TokInvalid Kind = iota ...
+type LiteToken interface{ ... }
+type Matcher interface{ ... }
+type Parser struct{ ... }
+    func NewParser(fn HandleFn) *Parser
+type TextScanner struct{ ... }
+    func NewScanner(in interface{}) *TextScanner
+type Token interface{ ... }
+type ValueToken struct{ ... }
+```
diff --git a/strutil/textscan/kvparse.go b/strutil/textscan/kvparse.go
@@ -14,20 +14,24 @@ import (
 const (
 	MultiLineValMarkS = "'''"
 	MultiLineValMarkD = `"""`
-	MultiLineValMarkH = "<<<" // at start
-	MultiLineValMarkQ = "\\"  // at end
+	MultiLineValMarkH = "<<<" // heredoc at start. <<<TXT ... TXT
+	MultiLineValMarkQ = "\\"  // at end. eg: properties contents
 	MultiLineCmtEnd   = "*/"
 	// VarRefStartChars  = "${"
 )
 
 // KeyValueMatcher match key-value token.
+// Supports parsing `KEY=VALUE` line text contents.
 type KeyValueMatcher struct {
 	// Separator string for split key and value, default is "="
 	Separator string
 	// MergeComments collect previous comments token to value token.
 	// If set as True, on each s.Scan() please notice skip TokComments
 	MergeComments bool
+	// InlineComment enables parsing and splitting of inline comments.
 	InlineComment bool
+	// KeyCheckFn is an optional func that checks whether the key string is valid.
+	KeyCheckFn func(key string) error
 }
 
 // Match text line.
@@ -53,6 +57,13 @@ func (m *KeyValueMatcher) Match(text string, prev Token) (Token, error) {
 		return nil, errorx.Rawf("key cannot be empty: %q", str)
 	}
 
+	// check key string.
+	if m.KeyCheckFn != nil {
+		if err := m.KeyCheckFn(key); err != nil {
+			return nil, err
+		}
+	}
+
 	// handle value
 	vln := len(val)
 	tok := &ValueToken{
@@ -142,7 +153,7 @@ func (m *KeyValueMatcher) DetectEnd(mark, text string) (ok bool, val string) {
 	return
 }
 
-// ValueToken struct
+// ValueToken contains key and value contents
 type ValueToken struct {
 	BaseToken
 	m *KeyValueMatcher
@@ -174,7 +185,7 @@ func (t *ValueToken) Comment() string {
 // Value text string.
 func (t *ValueToken) Value() string {
 	if len(t.values) > 0 {
-		return strings.Join(t.values, "\n")
+		return strings.Join(t.values, "")
 	}
 	return t.value
 }
@@ -184,6 +195,11 @@ func (t *ValueToken) HasMore() bool {
 	return t.more
 }
 
+// HasComment reports whether the value token has a comment (t.comment is non-nil).
+func (t *ValueToken) HasComment() bool {
+	return t.comment != nil
+}
+
 // MergeSame comments token
 func (t *ValueToken) MergeSame(_ Token) error {
 	return errors.New("merge value token not allowed")
@@ -215,8 +231,11 @@ func (t *ValueToken) ScanMore(ts *TextScanner) error {
 	}
 }
 
-// CommentsMatcher struct
+// CommentsMatcher matches comment lines.
+// It will automatically merge the previous comments token.
 type CommentsMatcher struct {
+	// InlineChars are the prefix chars used to match inline comments. Default is: #
+	InlineChars []byte
 	// MatchFn for comments line
 	// - mark 	useful on multi line comments
 	MatchFn func(text string) (ok, more bool, err error)
@@ -227,7 +246,18 @@ type CommentsMatcher struct {
 // Match comments token
 func (m *CommentsMatcher) Match(text string, prev Token) (Token, error) {
 	if m.MatchFn == nil {
-		m.MatchFn = CommentsDetect
+		if len(m.InlineChars) == 0 {
+			m.InlineChars = []byte{'#'}
+		}
+
+		m.MatchFn = func(text string) (ok, more bool, err error) {
+			return CommentsDetect(text, m.InlineChars)
+		}
+	}
+
+	// skip empty line
+	if text = strings.TrimSpace(text); text == "" {
+		return nil, nil
 	}
 
 	ok, more, err := m.MatchFn(text)
@@ -257,19 +287,28 @@ func (m *CommentsMatcher) Match(text string, prev Token) (Token, error) {
 }
 
 // CommentsDetect check.
-func CommentsDetect(text string) (ok, more bool, err error) {
-	str := strings.TrimSpace(text)
+//
+// - inlineChars: #
+//
+// default match:
+//
+//   - inline #, //
+//   - multi line: /*
+func CommentsDetect(str string, inlineChars []byte) (ok, more bool, err error) {
 	ln := len(str)
 	if ln == 0 {
 		return
 	}
 
-	// a line comments
-	if str[0] == '#' || str[0] == '!' {
-		ok = true
-		return
+	// match inline comments by prefix char.
+	for _, prefix := range inlineChars {
+		if str[0] == prefix {
+			ok = true
+			return
+		}
 	}
 
+	// match text starting with // OR /*
 	if str[0] == '/' {
 		if ln < 2 {
 			err = errorx.Rawf("invalid contents %q", str)