refactor: only check paragraph and sentence when needed

errata-ai · Jun 30, 2020 · 0cd4dfd · 0cd4dfd
1 parent 73350f5
commit 0cd4dfd
Show file tree

Hide file tree

Showing 4 changed files with 50 additions and 19 deletions.
diff --git a/check/check.go b/check/check.go
@@ -45,14 +45,19 @@ type ruleFn func(string, *core.File) []core.Alert
 type Manager struct {
 	AllChecks map[string]Check // presumably keyed by check name — TODO confirm against the builders
 	Config    *core.Config     // configuration handed to NewManager
+	Scopes    map[string]struct{} // set of base scopes: the part of each rule's "scope" before the first "."
 }
 
 // NewManager creates a new Manager and loads the rule definitions (that is,
 // extended checks) specified by config.
 func NewManager(config *core.Config) *Manager {
 	var path string
 
-	mgr := Manager{AllChecks: make(map[string]Check), Config: config}
+	mgr := Manager{
+		AllChecks: make(map[string]Check),
+		Config:    config,
+		Scopes:    make(map[string]struct{}),
+	}
 
 	// loadedStyles keeps track of the styles we've loaded as we go.
 	loadedStyles := []string{}
@@ -811,6 +816,9 @@ func (mgr *Manager) addCheck(file []byte, chkName string) error {
 		builder(chkName, generic, mgr)
 	}
 
+	base := strings.Split(generic["scope"].(string), ".")[0]
+	mgr.Scopes[base] = struct{}{}
+
 	return nil
 }
 

diff --git a/core/core.go b/core/core.go
@@ -287,9 +287,6 @@ func (f *File) ResetComments() {
 	}
 }
 
-// SentenceTokenizer splits text into sentences.
-var SentenceTokenizer = tokenize.NewPunktSentenceTokenizer()
-
 // WordTokenizer splits text into words using the regular expression below.
 var WordTokenizer = tokenize.NewRegexpTokenizer(
 	`[\p{L}[\p{N}]+(?:\.\w{2,4}\b)|(?:[A-Z]\.){2,}|[\p{L}[\p{N}]+['-][\p{L}-[\p{N}]+|[\p{L}[\p{N}@]+`, false, true) // NOTE(review): the inner '[' in each character class looks unintended (it would make '[' a word character) — confirm

diff --git a/core/util.go b/core/util.go
@@ -17,6 +17,7 @@ import (
 
 	"github.com/icza/gox/fmtx"
 	"github.com/jdkato/prose/tag"
+	"github.com/jdkato/prose/v2"
 	"github.com/jdkato/regexp"
 	"github.com/levigross/grequests"
 	"github.com/mholt/archiver"
@@ -117,9 +118,17 @@ func IsPhrase(s string) bool {
 
 // TextToWords converts raw text into a slice of words, one per token of a prose document built from text.
 func TextToWords(text string) []string {
+	nlp, err := prose.NewDocument(text, // only Tokens() is read below, so the other pipeline stages are switched off
+		prose.WithSegmentation(false),
+		prose.WithTagging(false),
+		prose.WithExtraction(false),
+	)
+
 	words := []string{} // stays empty (but non-nil) when the document could not be built
-	for _, s := range SentenceTokenizer.Tokenize(text) {
-		words = append(words, strings.Fields(s)...)
+	if CheckError(err, true) { // nlp is only dereferenced when CheckError reports true
+		for _, s := range nlp.Tokens() {
+			words = append(words, s.Text)
+		}
 	}
 	return words
 }

diff --git a/lint/lint.go b/lint/lint.go
@@ -38,6 +38,7 @@ import (
 
 	"github.com/errata-ai/vale/check"
 	"github.com/errata-ai/vale/core"
+	"github.com/jdkato/prose/v2"
 	"github.com/remeh/sizedwaitgroup"
 )
 
@@ -243,28 +244,44 @@ func (l *Linter) lintFile(src string) *core.File {
 
 func (l *Linter) lintProse(f *core.File, ctx, txt, raw string, lnTotal, lnLength int) {
 	var b Block
+
 	text := core.PrepText(txt)
 	rawText := core.PrepText(raw)
 	senScope := "sentence" + f.RealExt
-	parScope := "paragraph" + f.RealExt
-	txtScope := "text" + f.RealExt
 	hasCtx := ctx != ""
 
-	// Check if we NEED to do this!
-	for _, p := range strings.SplitAfter(text, "\n\n") {
-		for _, s := range core.SentenceTokenizer.Tokenize(p) {
-			sent := strings.TrimSpace(s)
-			if hasCtx {
-				b = NewBlock(ctx, sent, "", senScope)
-			} else {
-				b = NewBlock(p, sent, "", senScope)
+	if _, has := l.CheckManager.Scopes["paragraph"]; has {
+		for _, p := range strings.SplitAfter(text, "\n\n") {
+			if _, has := l.CheckManager.Scopes["sentence"]; has {
+				doc, _ := prose.NewDocument(p,
+					prose.WithTokenization(false),
+					prose.WithTagging(false),
+					prose.WithExtraction(false),
+				)
+				for _, s := range doc.Sentences() {
+					sent := strings.TrimSpace(s.Text)
+					if hasCtx {
+						b = NewBlock(ctx, sent, "", senScope)
+					} else {
+						b = NewBlock(p, sent, "", senScope)
+					}
+					l.lintText(f, b, lnTotal, lnLength)
+				}
 			}
-			l.lintText(f, b, lnTotal, lnLength)
+			l.lintText(
+				f,
+				NewBlock(ctx, p, "", "paragraph"+f.RealExt),
+				lnTotal,
+				lnLength)
 		}
-		l.lintText(f, NewBlock(ctx, p, "", parScope), lnTotal, lnLength)
+
 	}
 
-	l.lintText(f, NewBlock(ctx, text, rawText, txtScope), lnTotal, lnLength)
+	l.lintText(
+		f,
+		NewBlock(ctx, text, rawText, "text"+f.RealExt),
+		lnTotal,
+		lnLength)
 }
 
 func (l *Linter) lintLines(f *core.File) {