up: use goutil/strutil/textscan to refactor the parse logic

gookit · Oct 15, 2022 · 5dbc2f4 · 5dbc2f4
1 parent 052e9b2
commit 5dbc2f4
Show file tree

Hide file tree

Showing 8 changed files with 155 additions and 113 deletions.
diff --git a/README.md b/README.md
@@ -15,13 +15,20 @@ INI contents parser by golang, INI config data management library.
 
 - Easy to use(get: `Int` `Int64` `Bool` `String` `StringMap` ..., set: `Set`)
 - Support multi file, data load
-- Support for rebinding data to structure
+- Support decoding data to a struct
 - Support data override merge
 - Support parse ENV variable
 - Support comments start with  `;` `#`
 - Complete unit test(coverage > 90%)
 - Support variable reference, default compatible with Python's configParser format `%(VAR)s`
-- Sub-package `dotenv` that supports importing ENV data from files (eg `.env`)
+
+### [Parser](./parser)
+
+Package `parser` is a parser that converts INI format content to Go data
+
+### [Dotenv](./dotenv)
+
+Package `dotenv` supports importing ENV data from files (e.g. `.env`)
 
 ## More formats
 
@@ -219,17 +226,17 @@ type Options struct {
 }
 ```
 
-- setting options for default instance
+Setting options for default instance:
 
 ```go
 ini.WithOptions(ini.ParseEnv,ini.ParseVar)
 ```
 
-- setting options with new instance
+Setting options with new instance:
 
 ```go
 cfg := ini.New()
-cfg.WithOptions(ini.ParseEnv,ini.ParseVar, func (opts *Options) {
+cfg.WithOptions(ini.ParseEnv, ini.ParseVar, func (opts *Options) {
 	opts.SectionSep = ":"
 	opts.DefSection = "default"
 })

diff --git a/README.zh-CN.md b/README.zh-CN.md
@@ -11,15 +11,25 @@ INI格式内容解析; 使用INI格式作为配置，配置数据的加载，管
 
 > **[EN README](README.md)**
 
+## 功能简介
+
 - 使用简单(获取: `Int` `Int64` `Bool` `String` `StringMap` ..., 设置: `Set` )
 - 支持多文件，数据加载
 - 支持数据覆盖合并
-- 支持将数据重新绑定到结构体
-- 支持解析 ENV 变量名
+- 支持将数据绑定到结构体
+- 支持解析 `ENV` 变量名
 - 支持使用 `;` `#` 注释一行
-- 支持变量参考，默认兼容Python的configParser格式 `%(VAR)s`
+- 支持变量参考引用
+  - 默认兼容 Python 的 configParser 格式 `%(VAR)s`
 - 完善的单元测试(coverage > 90%)
-- 子包 `dotenv` - 提供了加载解析 `.env` 文件数据为ENV环境变量
+
+### [Parser](./parser)
+
+子包 `parser` - 实现了解析 `INI` 格式内容为 Go 数据
+
+### [Dotenv](./dotenv)
+
+子包 `dotenv` - 提供了加载解析 `.env` 文件数据为ENV环境变量
 
 ## 更多格式
 

diff --git a/dotenv/dotenv.go b/dotenv/dotenv.go
@@ -169,6 +169,7 @@ func getVal(name string) (val string, ok bool) {
 
 	// cached
 	if val = loadedData[name]; val != "" {
+		ok = true
 		return
 	}
 

diff --git a/dotenv/dotenv_test.go b/dotenv/dotenv_test.go
@@ -39,6 +39,7 @@ func TestLoadFiles(t *testing.T) {
 	err := LoadFiles("./testdata/.env")
 
 	assert.NoErr(t, err)
+	assert.NotEmpty(t, LoadedData())
 	assert.Eq(t, "blog", os.Getenv("DONT_ENV_TEST"))
 	assert.Eq(t, "blog", Get("DONT_ENV_TEST"))
 }

diff --git a/parser/options.go b/parser/options.go
@@ -2,8 +2,8 @@ package parser
 
 // mode of parse data
 //
-//	ModeFull   - will parse inline array
-//	ModeLite/ModeSimple - don't parse array value
+//	ModeFull   - will parse array value and inline array
+//	ModeLite/ModeSimple - don't parse array value line
 const (
 	ModeFull   parseMode = 1
 	ModeLite   parseMode = 2

diff --git a/parser/parser.go b/parser/parser.go
@@ -39,48 +39,38 @@ import (
 	"regexp"
 	"strings"
 
+	"github.com/gookit/goutil/strutil/textscan"
 	"github.com/mitchellh/mapstructure"
 )
 
-// errSyntax is returned when there is a syntax error in an INI file.
-type errSyntax struct {
-	Line int
-	// Source The contents of the erroneous line, without leading or trailing whitespace
-	Source string
-}
+// match: [section]
+var sectionRegex = regexp.MustCompile(`^\[(.*)]$`)
 
-// Error message return
-func (e errSyntax) Error() string {
-	return fmt.Sprintf("invalid INI syntax on line %d: %s", e.Line, e.Source)
-}
+// TokSection for mark a section
+const TokSection = textscan.TokComments + 1 + iota
 
-var (
-	// match: [section]
-	sectionRegex = regexp.MustCompile(`^\[(.*)]$`)
-	// match: foo[] = val
-	assignArrRegex = regexp.MustCompile(`^([^=\[\]]+)\[][^=]*=(.*)$`)
-	// match: key = val
-	assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
-	// quote ' "
-	quotesRegex = regexp.MustCompile(`^(['"])(.*)(['"])$`)
-)
+// SectionMatcher match section line: [section]
+type SectionMatcher struct{}
 
-// special chars consts
-const (
-	MultiLineValMarkS = "'''"
-	MultiLineValMarkD = `"""`
-)
+// Match section line: [section]
+func (m *SectionMatcher) Match(text string, prev textscan.Token) (textscan.Token, error) {
+	line := strings.TrimSpace(text)
 
-// token consts
-const (
-	TokMLValMarkS = 'm' // multi line value by single quotes: '''
-	TokMLValMarkD = 'M' // multi line value by double quotes: """
-)
+	if matched := sectionRegex.FindStringSubmatch(line); matched != nil {
+		section := strings.TrimSpace(matched[1])
+		tok := textscan.NewStringToken(TokSection, section)
+		return tok, nil
+	}
+
+	return nil, nil
+}
 
 // Parser definition
 type Parser struct {
 	*Options
 	// parsed bool
+	// comments map, key is name
+	comments map[string]string
 
 	// for full parse(allow array, map section)
 	fullData map[string]any
@@ -185,6 +175,7 @@ func (p *Parser) init() {
 	// if p.IgnoreCase {
 	// 	p.DefSection = strings.ToLower(p.DefSection)
 	// }
+	p.comments = make(map[string]string)
 
 	if p.ParseMode == ModeFull {
 		p.fullData = make(map[string]any)
@@ -202,56 +193,72 @@ func (p *Parser) init() {
 }
 
 // ParseFrom a data scanner
-func (p *Parser) ParseFrom(in *bufio.Scanner) (bytes int64, err error) {
+func (p *Parser) ParseFrom(in *bufio.Scanner) (count int64, err error) {
 	p.init()
+	count = -1
+
+	// create scanner
+	ts := textscan.NewScanner(in)
+	ts.AddKind(TokSection, "Section")
+	ts.AddMatchers(
+		&textscan.CommentsMatcher{
+			InlineChars: []byte{'#', ';'},
+		},
+		&SectionMatcher{},
+		&textscan.KeyValueMatcher{
+			MergeComments: true,
+			InlineComment: p.InlineComment,
+		},
+	)
 
-	bytes = -1
-	lineNum := 0
 	section := p.DefSection
 
-	var readOk bool
-	for readOk = in.Scan(); readOk; readOk = in.Scan() {
-		line := in.Text()
-
-		bytes++ // newline
-		bytes += int64(len(line))
+	// scan and parsing
+	for ts.Scan() {
+		tok := ts.Token()
 
-		lineNum++
-		line = strings.TrimSpace(line)
-		if len(line) == 0 { // Skip blank lines
+		// comments has been merged to value token
+		if !tok.IsValid() || tok.Kind() == textscan.TokComments {
 			continue
 		}
 
-		if line[0] == ';' || line[0] == '#' { // Skip comments
+		if tok.Kind() == TokSection {
+			section = tok.Value()
+
+			// collect comments
+			if textscan.IsKindToken(textscan.TokComments, ts.PrevToken()) {
+				p.comments["_sec_"+section] = ts.PrevToken().Value()
+			}
 			continue
 		}
 
-		// array/slice data
-		if matched := assignArrRegex.FindStringSubmatch(line); matched != nil {
-			// skip array parse on lite mode
-			if p.ParseMode == ModeLite {
-				continue
-			}
+		// collect value
+		if tok.Kind() == textscan.TokValue {
+			vt := tok.(*textscan.ValueToken)
 
-			key, val := strings.TrimSpace(matched[1]), trimWithQuotes(matched[2])
+			var isSli bool
+			key := vt.Key()
 
-			p.collectValue(section, key, val, true)
-		} else if matched := assignRegex.FindStringSubmatch(line); matched != nil {
-			key, val := strings.TrimSpace(matched[1]), trimWithQuotes(matched[2])
+			// is array index
+			if strings.HasSuffix(key, "[]") {
+				// skip parse array on lite mode
+				if p.ParseMode == ModeLite {
+					continue
+				}
 
-			p.collectValue(section, key, val, false)
-		} else if matched := sectionRegex.FindStringSubmatch(line); matched != nil {
-			section = strings.TrimSpace(matched[1])
-		} else {
-			err = errSyntax{lineNum, line}
-			return
+				key = key[:len(key)-2]
+				isSli = true
+			}
+
+			p.collectValue(section, key, vt.Value(), isSli)
+			if vt.HasComment() {
+				p.comments[section+"_"+key] = vt.Comment()
+			}
 		}
 	}
 
-	err = in.Err()
-	if bytes < 0 {
-		bytes = 0
-	}
+	count = 0
+	err = ts.Err()
 	return
 }
 
@@ -261,15 +268,11 @@ func (p *Parser) collectValue(section, key, val string, isSlice bool) {
 		section = strings.ToLower(section)
 	}
 
-	if p.InlineComment {
-		val, _ = splitInlineComment(val)
-	}
-
 	if p.ReplaceNl {
 		val = strings.ReplaceAll(val, `\n`, "\n")
 	}
 
-	p.Collector(section, key, val, false)
+	p.Collector(section, key, val, isSlice)
 }
 
 func (p *Parser) collectFullValue(section, key, val string, isSlice bool) {
@@ -305,8 +308,7 @@ func (p *Parser) collectFullValue(section, key, val string, isSlice bool) {
 
 	switch sd := secData.(type) {
 	case map[string]any: // existed section
-		curVal, ok := sd[key]
-		if ok {
+		if curVal, ok := sd[key]; ok {
 			switch cv := curVal.(type) {
 			case string:
 				if isSlice {
@@ -351,22 +353,11 @@ func (p *Parser) collectLiteValue(sec, key, val string, _ bool) {
 	}
 }
 
-func splitInlineComment(val string) (string, string) {
-	if pos := strings.IndexRune(val, '#'); pos > -1 {
-		return strings.TrimRight(val[0:pos], " "), val[pos:]
-	}
-
-	if pos := strings.Index(val, "//"); pos > -1 {
-		return strings.TrimRight(val[0:pos], " "), val[pos:]
-	}
-	return val, ""
-}
-
 /*************************************************************
  * export data
  *************************************************************/
 
-// Decode mapping the parsed data to struct ptr
+// Decode the parsed data to struct ptr
 func (p *Parser) Decode(ptr any) error {
 	return p.MapStruct(ptr)
 }
@@ -436,6 +427,11 @@ func mapStruct(tagName string, data any, ptr any) error {
  * helper methods
  *************************************************************/
 
+// Comments get
+func (p *Parser) Comments() map[string]string {
+	return p.comments
+}
+
 // ParsedData get parsed data
 func (p *Parser) ParsedData() interface{} {
 	if p.ParseMode == ModeFull {
@@ -473,13 +469,3 @@ func (p *Parser) Reset() {
 		p.liteData = make(map[string]map[string]string)
 	}
 }
-
-func trimWithQuotes(inputVal string) (filtered string) {
-	filtered = strings.TrimSpace(inputVal)
-	groups := quotesRegex.FindStringSubmatch(filtered)
-
-	if len(groups) > 2 && groups[1] == groups[3] {
-		filtered = groups[2]
-	}
-	return
-}