Skip to content

Commit

Permalink
refactor: only check paragraph and sentence when needed
Browse files Browse the repository at this point in the history
  • Loading branch information
jdkato committed Jun 30, 2020
1 parent 73350f5 commit 0cd4dfd
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 19 deletions.
10 changes: 9 additions & 1 deletion check/check.go
Expand Up @@ -45,14 +45,19 @@ type ruleFn func(string, *core.File) []core.Alert
type Manager struct {
// AllChecks holds the loaded checks, keyed by string.
// NOTE(review): presumably the key is the check's name — confirm against
// the builder functions that populate it.
AllChecks map[string]Check
// Config is the configuration the Manager was constructed with.
Config *core.Config
// Scopes is the set of base scopes used by the loaded rules. addCheck
// records the first dot-separated segment of each rule's "scope" value
// here (e.g. "sentence.md" is recorded as "sentence"). Only key presence
// matters; the empty-struct values carry no data.
Scopes map[string]struct{}
}

// NewManager creates a new Manager and loads the rule definitions (that is,
// extended checks) specified by config.
func NewManager(config *core.Config) *Manager {
var path string

mgr := Manager{AllChecks: make(map[string]Check), Config: config}
mgr := Manager{
AllChecks: make(map[string]Check),
Config: config,
Scopes: make(map[string]struct{}),
}

// loadedStyles keeps track of the styles we've loaded as we go.
loadedStyles := []string{}
Expand Down Expand Up @@ -811,6 +816,9 @@ func (mgr *Manager) addCheck(file []byte, chkName string) error {
builder(chkName, generic, mgr)
}

base := strings.Split(generic["scope"].(string), ".")[0]
mgr.Scopes[base] = struct{}{}

return nil
}

Expand Down
3 changes: 0 additions & 3 deletions core/core.go
Expand Up @@ -287,9 +287,6 @@ func (f *File) ResetComments() {
}
}

// SentenceTokenizer splits text into sentences.
var SentenceTokenizer = tokenize.NewPunktSentenceTokenizer()

// WordTokenizer splits text into words.
//
// The pattern is an alternation of four forms, tried in order: a run of
// word characters followed by a dot and a 2-4 character suffix, two or more
// "uppercase letter + dot" pairs, two runs joined by an apostrophe or hyphen,
// and a plain run that may also contain '@'.
//
// NOTE(review): the character classes are written as `[\p{L}[\p{N}]`, which
// also admits a literal '[' — this looks unintended; confirm before changing.
// NOTE(review): the trailing `false, true` arguments are presumably the
// tokenizer's "gaps" and "discard" options — confirm against prose/tokenize.
var WordTokenizer = tokenize.NewRegexpTokenizer(
`[\p{L}[\p{N}]+(?:\.\w{2,4}\b)|(?:[A-Z]\.){2,}|[\p{L}[\p{N}]+['-][\p{L}-[\p{N}]+|[\p{L}[\p{N}@]+`, false, true)
Expand Down
13 changes: 11 additions & 2 deletions core/util.go
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/icza/gox/fmtx"
"github.com/jdkato/prose/tag"
"github.com/jdkato/prose/v2"
"github.com/jdkato/regexp"
"github.com/levigross/grequests"
"github.com/mholt/archiver"
Expand Down Expand Up @@ -117,9 +118,17 @@ func IsPhrase(s string) bool {

// TextToWords convert raw text into a slice of words.
func TextToWords(text string) []string {
nlp, err := prose.NewDocument(text,
prose.WithSegmentation(false),
prose.WithTagging(false),
prose.WithExtraction(false),
)

words := []string{}
for _, s := range SentenceTokenizer.Tokenize(text) {
words = append(words, strings.Fields(s)...)
if CheckError(err, true) {
for _, s := range nlp.Tokens() {
words = append(words, s.Text)
}
}
return words
}
Expand Down
43 changes: 30 additions & 13 deletions lint/lint.go
Expand Up @@ -38,6 +38,7 @@ import (

"github.com/errata-ai/vale/check"
"github.com/errata-ai/vale/core"
"github.com/jdkato/prose/v2"
"github.com/remeh/sizedwaitgroup"
)

Expand Down Expand Up @@ -243,28 +244,44 @@ func (l *Linter) lintFile(src string) *core.File {

func (l *Linter) lintProse(f *core.File, ctx, txt, raw string, lnTotal, lnLength int) {
var b Block

text := core.PrepText(txt)
rawText := core.PrepText(raw)
senScope := "sentence" + f.RealExt
parScope := "paragraph" + f.RealExt
txtScope := "text" + f.RealExt
hasCtx := ctx != ""

// Check if we NEED to do this!
for _, p := range strings.SplitAfter(text, "\n\n") {
for _, s := range core.SentenceTokenizer.Tokenize(p) {
sent := strings.TrimSpace(s)
if hasCtx {
b = NewBlock(ctx, sent, "", senScope)
} else {
b = NewBlock(p, sent, "", senScope)
if _, has := l.CheckManager.Scopes["paragraph"]; has {
for _, p := range strings.SplitAfter(text, "\n\n") {
if _, has := l.CheckManager.Scopes["sentence"]; has {
doc, _ := prose.NewDocument(p,
prose.WithTokenization(false),
prose.WithTagging(false),
prose.WithExtraction(false),
)
for _, s := range doc.Sentences() {
sent := strings.TrimSpace(s.Text)
if hasCtx {
b = NewBlock(ctx, sent, "", senScope)
} else {
b = NewBlock(p, sent, "", senScope)
}
l.lintText(f, b, lnTotal, lnLength)
}
}
l.lintText(f, b, lnTotal, lnLength)
l.lintText(
f,
NewBlock(ctx, p, "", "paragraph"+f.RealExt),
lnTotal,
lnLength)
}
l.lintText(f, NewBlock(ctx, p, "", parScope), lnTotal, lnLength)

}

l.lintText(f, NewBlock(ctx, text, rawText, txtScope), lnTotal, lnLength)
l.lintText(
f,
NewBlock(ctx, text, rawText, "text"+f.RealExt),
lnTotal,
lnLength)
}

func (l *Linter) lintLines(f *core.File) {
Expand Down

0 comments on commit 0cd4dfd

Please sign in to comment.