From 5dbc2f43d8b7fa89b46419dc2846bc32826de384 Mon Sep 17 00:00:00 2001
From: Inhere <in.798@qq.com>
Date: Sat, 15 Oct 2022 20:54:32 +0800
Subject: [PATCH] up: use goutil/strutil/textscan to refactor the parse logic

---
 README.md             |  17 +++--
 README.zh-CN.md       |  18 ++++-
 dotenv/dotenv.go      |   1 +
 dotenv/dotenv_test.go |   1 +
 parser/options.go     |   4 +-
 parser/parser.go      | 170 +++++++++++++++++++-----------------------
 parser/parser_test.go |  55 +++++++++++---
 testdata/export.ini   |   2 +-
 8 files changed, 155 insertions(+), 113 deletions(-)

diff --git a/README.md b/README.md
index 353ec94..3936fab 100644
--- a/README.md
+++ b/README.md
@@ -15,13 +15,20 @@ INI contents parser by golang, INI config data management library.
 
 - Easy to use(get: `Int` `Int64` `Bool` `String` `StringMap` ..., set: `Set`)
 - Support multi file, data load
-- Support for rebinding data to structure
+- Support decoding data to a struct
 - Support data override merge
 - Support parse ENV variable
 - Support comments start with  `;` `#`
 - Complete unit test(coverage > 90%)
 - Support variable reference, default compatible with Python's configParser format `%(VAR)s`
-- Sub-package `dotenv` that supports importing ENV data from files (eg `.env`)
+
+### [Parser](./parser)
+
+Package `parser` is a parser that converts INI format content to Go data.
+
+### [Dotenv](./dotenv)
+
+Package `dotenv` supports importing ENV data from files (e.g. `.env`).
 
 ## More formats
 
@@ -219,17 +226,17 @@ type Options struct {
 }
 ```
 
-- setting options for default instance
+Setting options for the default instance:
 
 ```go
 ini.WithOptions(ini.ParseEnv,ini.ParseVar)
 ```
 
-- setting options with new instance
+Setting options with a new instance:
 
 ```go
 cfg := ini.New()
-cfg.WithOptions(ini.ParseEnv,ini.ParseVar, func (opts *Options) {
+cfg.WithOptions(ini.ParseEnv, ini.ParseVar, func (opts *Options) {
 	opts.SectionSep = ":"
 	opts.DefSection = "default"
 })
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 94fa059..93f5336 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -11,15 +11,25 @@ INI格式内容解析; 使用INI格式作为配置，配置数据的加载，管
 
 > **[EN README](README.md)**
 
+## 功能简介
+
 - 使用简单(获取: `Int` `Int64` `Bool` `String` `StringMap` ..., 设置: `Set` )
 - 支持多文件，数据加载
 - 支持数据覆盖合并
-- 支持将数据重新绑定到结构体
-- 支持解析 ENV 变量名
+- 支持将数据绑定到结构体
+- 支持解析 `ENV` 变量名
 - 支持使用 `;` `#` 注释一行
-- 支持变量参考，默认兼容Python的configParser格式 `%(VAR)s`
+- 支持变量参考引用
+  - 默认兼容 Python 的 configParser 格式 `%(VAR)s`
 - 完善的单元测试(coverage > 90%)
-- 子包 `dotenv` - 提供了加载解析 `.env` 文件数据为ENV环境变量
+
+### [Parser](./parser)
+
+子包 `parser` - 实现了解析 `INI` 格式内容为 Go 数据
+
+### [Dotenv](./dotenv)
+
+子包 `dotenv` - 提供了加载解析 `.env` 文件数据为ENV环境变量
 
 ## 更多格式
 
diff --git a/dotenv/dotenv.go b/dotenv/dotenv.go
index 7e220bb..ac324fe 100644
--- a/dotenv/dotenv.go
+++ b/dotenv/dotenv.go
@@ -169,6 +169,7 @@ func getVal(name string) (val string, ok bool) {
 
 	// cached
 	if val = loadedData[name]; val != "" {
+		ok = true
 		return
 	}
 
diff --git a/dotenv/dotenv_test.go b/dotenv/dotenv_test.go
index adac99b..1b9dc58 100644
--- a/dotenv/dotenv_test.go
+++ b/dotenv/dotenv_test.go
@@ -39,6 +39,7 @@ func TestLoadFiles(t *testing.T) {
 	err := LoadFiles("./testdata/.env")
 
 	assert.NoErr(t, err)
+	assert.NotEmpty(t, LoadedData())
 	assert.Eq(t, "blog", os.Getenv("DONT_ENV_TEST"))
 	assert.Eq(t, "blog", Get("DONT_ENV_TEST"))
 }
diff --git a/parser/options.go b/parser/options.go
index d627590..7f32449 100644
--- a/parser/options.go
+++ b/parser/options.go
@@ -2,8 +2,8 @@ package parser
 
 // mode of parse data
 //
-//	ModeFull   - will parse inline array
-//	ModeLite/ModeSimple - don't parse array value
+//	ModeFull   - parses array values and inline arrays
+//	ModeLite/ModeSimple - does not parse array value lines
 const (
 	ModeFull   parseMode = 1
 	ModeLite   parseMode = 2
diff --git a/parser/parser.go b/parser/parser.go
index 29ecc95..414f25f 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -39,48 +39,38 @@ import (
 	"regexp"
 	"strings"
 
+	"github.com/gookit/goutil/strutil/textscan"
 	"github.com/mitchellh/mapstructure"
 )
 
-// errSyntax is returned when there is a syntax error in an INI file.
-type errSyntax struct {
-	Line int
-	// Source The contents of the erroneous line, without leading or trailing whitespace
-	Source string
-}
+// match: [section]
+var sectionRegex = regexp.MustCompile(`^\[(.*)]$`)
 
-// Error message return
-func (e errSyntax) Error() string {
-	return fmt.Sprintf("invalid INI syntax on line %d: %s", e.Line, e.Source)
-}
+// TokSection is the token kind that marks a section line.
+const TokSection = textscan.TokComments + 1 + iota
 
-var (
-	// match: [section]
-	sectionRegex = regexp.MustCompile(`^\[(.*)]$`)
-	// match: foo[] = val
-	assignArrRegex = regexp.MustCompile(`^([^=\[\]]+)\[][^=]*=(.*)$`)
-	// match: key = val
-	assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
-	// quote ' "
-	quotesRegex = regexp.MustCompile(`^(['"])(.*)(['"])$`)
-)
+// SectionMatcher matches a section line: [section]
+type SectionMatcher struct{}
 
-// special chars consts
-const (
-	MultiLineValMarkS = "'''"
-	MultiLineValMarkD = `"""`
-)
+// Match returns a TokSection token for a [section] line, or nil if text is not one.
+func (m *SectionMatcher) Match(text string, prev textscan.Token) (textscan.Token, error) {
+	line := strings.TrimSpace(text)
 
-// token consts
-const (
-	TokMLValMarkS = 'm' // multi line value by single quotes: '''
-	TokMLValMarkD = 'M' // multi line value by double quotes: """
-)
+	if matched := sectionRegex.FindStringSubmatch(line); matched != nil {
+		section := strings.TrimSpace(matched[1])
+		tok := textscan.NewStringToken(TokSection, section)
+		return tok, nil
+	}
+
+	return nil, nil
+}
 
 // Parser definition
 type Parser struct {
 	*Options
 	// parsed bool
+	// comments collected while parsing; key is "_sec_"+section or section+"_"+key
+	comments map[string]string
 
 	// for full parse(allow array, map section)
 	fullData map[string]any
@@ -185,6 +175,7 @@ func (p *Parser) init() {
 	// if p.IgnoreCase {
 	// 	p.DefSection = strings.ToLower(p.DefSection)
 	// }
+	p.comments = make(map[string]string)
 
 	if p.ParseMode == ModeFull {
 		p.fullData = make(map[string]any)
@@ -202,56 +193,72 @@ func (p *Parser) init() {
 }
 
 // ParseFrom a data scanner
-func (p *Parser) ParseFrom(in *bufio.Scanner) (bytes int64, err error) {
+func (p *Parser) ParseFrom(in *bufio.Scanner) (count int64, err error) {
 	p.init()
+	count = -1
+
+	// create scanner
+	ts := textscan.NewScanner(in)
+	ts.AddKind(TokSection, "Section")
+	ts.AddMatchers(
+		&textscan.CommentsMatcher{
+			InlineChars: []byte{'#', ';'},
+		},
+		&SectionMatcher{},
+		&textscan.KeyValueMatcher{
+			MergeComments: true,
+			InlineComment: p.InlineComment,
+		},
+	)
 
-	bytes = -1
-	lineNum := 0
 	section := p.DefSection
 
-	var readOk bool
-	for readOk = in.Scan(); readOk; readOk = in.Scan() {
-		line := in.Text()
-
-		bytes++ // newline
-		bytes += int64(len(line))
+	// scan and parsing
+	for ts.Scan() {
+		tok := ts.Token()
 
-		lineNum++
-		line = strings.TrimSpace(line)
-		if len(line) == 0 { // Skip blank lines
+		// comments have been merged into the value token
+		if !tok.IsValid() || tok.Kind() == textscan.TokComments {
 			continue
 		}
 
-		if line[0] == ';' || line[0] == '#' { // Skip comments
+		if tok.Kind() == TokSection {
+			section = tok.Value()
+
+			// collect comments
+			if textscan.IsKindToken(textscan.TokComments, ts.PrevToken()) {
+				p.comments["_sec_"+section] = ts.PrevToken().Value()
+			}
 			continue
 		}
 
-		// array/slice data
-		if matched := assignArrRegex.FindStringSubmatch(line); matched != nil {
-			// skip array parse on lite mode
-			if p.ParseMode == ModeLite {
-				continue
-			}
+		// collect value
+		if tok.Kind() == textscan.TokValue {
+			vt := tok.(*textscan.ValueToken)
 
-			key, val := strings.TrimSpace(matched[1]), trimWithQuotes(matched[2])
+			var isSli bool
+			key := vt.Key()
 
-			p.collectValue(section, key, val, true)
-		} else if matched := assignRegex.FindStringSubmatch(line); matched != nil {
-			key, val := strings.TrimSpace(matched[1]), trimWithQuotes(matched[2])
+			// key ends with "[]": treat it as an array item
+			if strings.HasSuffix(key, "[]") {
+				// skip parse array on lite mode
+				if p.ParseMode == ModeLite {
+					continue
+				}
 
-			p.collectValue(section, key, val, false)
-		} else if matched := sectionRegex.FindStringSubmatch(line); matched != nil {
-			section = strings.TrimSpace(matched[1])
-		} else {
-			err = errSyntax{lineNum, line}
-			return
+				key = key[:len(key)-2]
+				isSli = true
+			}
+
+			p.collectValue(section, key, vt.Value(), isSli)
+			if vt.HasComment() {
+				p.comments[section+"_"+key] = vt.Comment()
+			}
 		}
 	}
 
-	err = in.Err()
-	if bytes < 0 {
-		bytes = 0
-	}
+	count = 0
+	err = ts.Err()
 	return
 }
 
@@ -261,15 +268,11 @@ func (p *Parser) collectValue(section, key, val string, isSlice bool) {
 		section = strings.ToLower(section)
 	}
 
-	if p.InlineComment {
-		val, _ = splitInlineComment(val)
-	}
-
 	if p.ReplaceNl {
 		val = strings.ReplaceAll(val, `\n`, "\n")
 	}
 
-	p.Collector(section, key, val, false)
+	p.Collector(section, key, val, isSlice)
 }
 
 func (p *Parser) collectFullValue(section, key, val string, isSlice bool) {
@@ -305,8 +308,7 @@ func (p *Parser) collectFullValue(section, key, val string, isSlice bool) {
 
 	switch sd := secData.(type) {
 	case map[string]any: // existed section
-		curVal, ok := sd[key]
-		if ok {
+		if curVal, ok := sd[key]; ok {
 			switch cv := curVal.(type) {
 			case string:
 				if isSlice {
@@ -351,22 +353,11 @@ func (p *Parser) collectLiteValue(sec, key, val string, _ bool) {
 	}
 }
 
-func splitInlineComment(val string) (string, string) {
-	if pos := strings.IndexRune(val, '#'); pos > -1 {
-		return strings.TrimRight(val[0:pos], " "), val[pos:]
-	}
-
-	if pos := strings.Index(val, "//"); pos > -1 {
-		return strings.TrimRight(val[0:pos], " "), val[pos:]
-	}
-	return val, ""
-}
-
 /*************************************************************
  * export data
  *************************************************************/
 
-// Decode mapping the parsed data to struct ptr
+// Decode decodes the parsed data into the struct that ptr points to.
 func (p *Parser) Decode(ptr any) error {
 	return p.MapStruct(ptr)
 }
@@ -436,6 +427,11 @@ func mapStruct(tagName string, data any, ptr any) error {
  * helper methods
  *************************************************************/
 
+// Comments returns the comments collected during parsing.
+func (p *Parser) Comments() map[string]string {
+	return p.comments
+}
+
 // ParsedData get parsed data
 func (p *Parser) ParsedData() interface{} {
 	if p.ParseMode == ModeFull {
@@ -473,13 +469,3 @@ func (p *Parser) Reset() {
 		p.liteData = make(map[string]map[string]string)
 	}
 }
-
-func trimWithQuotes(inputVal string) (filtered string) {
-	filtered = strings.TrimSpace(inputVal)
-	groups := quotesRegex.FindStringSubmatch(filtered)
-
-	if len(groups) > 2 && groups[1] == groups[3] {
-		filtered = groups[2]
-	}
-	return
-}
diff --git a/parser/parser_test.go b/parser/parser_test.go
index b0caac3..72c77af 100644
--- a/parser/parser_test.go
+++ b/parser/parser_test.go
@@ -1,15 +1,18 @@
 package parser
 
 import (
+	"bufio"
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/gookit/goutil/dump"
+	"github.com/gookit/goutil/strutil/textscan"
 	"github.com/gookit/goutil/testutil/assert"
 )
 
 var iniStr = `
-# comments
+# comments 1
 name = inhere
 age = 28
 debug = true
@@ -18,17 +21,18 @@ hasQuota2 = "this is val1"
 shell = ${SHELL}
 noEnv = ${NotExist|defValue}
 
-; array in def section
+; array in default section
 tags[] = a
 tags[] = b
 tags[] = c
 
-; comments
+; comments 2
 [sec1]
 key = val0
 some = value
 stuff = things
-; array in section
+
+; array in section sec1
 types[] = x
 types[] = y
 `
@@ -105,7 +109,7 @@ two_words = abc def
 	is.Eq("[a b]", fmt.Sprintf("%v", data["arr"]))
 	is.Eq("map[key:val]", fmt.Sprintf("%v", data["sec"]))
 
-	st := struct {
+	type myConf struct {
 		Age  int
 		Name string
 		Sec1 struct {
@@ -113,10 +117,27 @@ two_words = abc def
 			Number   int
 			TwoWords string `ini:"two_words"`
 		}
-	}{}
+	}
 
-	is.Nil(Decode(bts, &st))
+	st := &myConf{}
+	is.NoErr(Decode(bts, st))
+	is.Eq(23, st.Age)
+	is.Eq("inhere", st.Name)
+	is.Eq(2020, st.Sec1.Number)
+	is.Eq("abc def", st.Sec1.TwoWords)
 	dump.P(st)
+
+	// Unmarshal
+	p := NewLite(func(opt *Options) {
+		opt.NoDefSection = true
+	})
+
+	st = &myConf{}
+	is.NoErr(p.Unmarshal(bts, st))
+	is.Eq(23, st.Age)
+	is.Eq("inhere", st.Name)
+	is.Eq(2020, st.Sec1.Number)
+	is.Eq("abc def", st.Sec1.TwoWords)
 }
 
 func TestNewSimpled(t *testing.T) {
@@ -131,8 +152,9 @@ func TestNewSimpled(t *testing.T) {
 
 	err := p.ParseString("invalid string")
 	is.Err(err)
-	is.IsType(errSyntax{}, err)
-	is.Contains(err.Error(), "invalid INI syntax on line")
+	is.IsType(textscan.ErrScan{}, err)
+	// is.Contains(err.Error(), "invalid syntax, no matcher available")
+	is.Contains(err.Error(), "line 1: invalid string")
 
 	err = p.ParseString("")
 	is.NoErr(err)
@@ -143,6 +165,7 @@ func TestNewSimpled(t *testing.T) {
 	is.Nil(err)
 
 	data := p.SimpleData()
+	dump.P(data, p.Comments())
 	str := fmt.Sprintf("%v", data)
 	is.Contains(str, "hasQuota2:")
 	is.NotContains(str, "hasquota1:")
@@ -191,7 +214,9 @@ key = val0
 	is.Nil(err)
 
 	v := p.ParsedData()
+	dump.P(v, p.Comments())
 	is.NotEmpty(v)
+	is.ContainsKey(v, "sec1")
 
 	// options: ignore case
 	p = NewFulled(IgnoreCase)
@@ -218,6 +243,13 @@ func TestParser_ParseBytes(t *testing.T) {
 	is.Len(p.LiteData(), 0)
 }
 
+func TestParser_ParseFrom(t *testing.T) {
+	p := New()
+	n, err := p.ParseFrom(bufio.NewScanner(strings.NewReader("")))
+	assert.Eq(t, int64(0), n)
+	assert.NoErr(t, err)
+}
+
 func TestParser_ParseString(t *testing.T) {
 	p := New(WithParseMode(ModeFull))
 	err := p.ParseString(`
@@ -230,4 +262,9 @@ arr[] = val4
 	assert.NoErr(t, err)
 	assert.NotEmpty(t, p.fullData)
 	dump.P(p.ParsedData())
+
+	p.Reset()
+	assert.NoErr(t, p.ParseString(`
+# no values
+`))
 }
diff --git a/testdata/export.ini b/testdata/export.ini
index 81046f2..5d2f7aa 100644
--- a/testdata/export.ini
+++ b/testdata/export.ini
@@ -1,4 +1,4 @@
-# exported at 2022-09-17 14:55:16
+# exported at 2022-10-15 20:43:25
 
 # values for default section
 age = 28