/
normalize.go
153 lines (143 loc) · 4.08 KB
/
normalize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package old
import (
"strings"
"unicode"
"unicode/utf8"
)
const (
// blankID is the word ID normalize records for a "___" fill-in-the-blank
// wildcard. It is negative, so it cannot collide with a dictionary word ID,
// which is an index into Checker.words.
blankID = -1
// unknownWordID is the word ID normalize records for a word that is not in
// the dictionary when it was asked not to update the dictionary.
unknownWordID = -2
)
// htmlesc unescapes the HTML escapes that we've observed in license texts,
// especially in Markdown-formatted licenses.
//
// Every replacement is padded with spaces so that it has exactly the same
// length in bytes as the escape it replaces. normalize records byte offsets
// into the replaced string and uses them as offsets into the original data,
// so a replacement of a different length would shift every later offset.
// Only complete escapes are matched; a bare "&" is left alone (it is not a
// word character and is skipped during tokenization anyway).
var htmlesc = strings.NewReplacer(
	"&ldquo;", "   \"   ", // 7 bytes -> 7 bytes
	"&rdquo;", "   \"   ", // 7 bytes -> 7 bytes
	"&amp;", "  &  ", // 5 bytes -> 5 bytes
)
// normalize tokenizes data into a document: a sequence of word IDs together
// with, for each word, the byte offset at which it starts.
//
// The input is lowercased with toLower and passed through htmlesc before
// tokenizing. A word is a maximal run of runes accepted by isWordChar;
// everything else (spaces, punctuation, digits, ...) only separates words.
// Two special cases apply:
//   - The literal "___" is a fill-in-the-blank wildcard and is recorded
//     with the ID blankID.
//   - A short word that isListMarker recognizes (such as "ii" followed by
//     ")") is dropped entirely.
//
// Each remaining word is looked up in c.dict. If it is missing and
// updateDict is true, it is appended to c.words and entered in c.dict under
// its new index; if it is missing and updateDict is false, it is recorded
// as unknownWordID. Comparing integer IDs is faster than comparing strings.
//
// The returned document keeps the original data as its text. The recorded
// offsets index the lowercased, unescaped string, so they line up with data
// only as long as toLower and htmlesc preserve byte length.
func (c *Checker) normalize(data []byte, updateDict bool) *document {
	const blank = "___" // fill in the blank wildcard

	text := htmlesc.Replace(toLower(data))

	var (
		ids     = make([]int32, 0, 100)
		offsets = make([]int32, 0, 100)
	)

	// Each pass consumes a wildcard, one separator rune, or one whole word.
	for i := 0; i < len(text); {
		begin := i

		if strings.HasPrefix(text[i:], blank) {
			ids = append(ids, blankID)
			offsets = append(offsets, int32(begin))
			i += len(blank)
			continue
		}

		ch, size := utf8.DecodeRuneInString(text[i:])
		i += size
		if !isWordChar(ch) {
			// Separator: skip it and look again.
			continue
		}

		// Extend the word until a non-word rune or the end of the text.
		// On exit ch is the rune that stopped the scan, or, at end of
		// text, the final letter of the word.
		for i < len(text) {
			ch, size = utf8.DecodeRuneInString(text[i:])
			if !isWordChar(ch) {
				break // Leave the separator for the next pass.
			}
			i += size
		}

		word := text[begin:i]

		// Drop list markers. The longest is maxListMarkerLength bytes, so
		// longer words need no lookup. At end of text ch is a letter and
		// therefore never the punctuation a marker requires.
		if len(word) <= maxListMarkerLength && isListMarker(word, ch) {
			continue
		}

		id, known := c.dict[word]
		switch {
		case known:
			// Use the existing ID.
		case updateDict:
			id = int32(len(c.words))
			c.words = append(c.words, word)
			c.dict[word] = id
		default:
			id = unknownWordID
		}
		ids = append(ids, id)
		offsets = append(offsets, int32(begin))
	}

	return &document{
		text:    data,
		words:   ids,
		byteOff: offsets,
	}
}
// toLower returns a lowercased copy of b as a string that is exactly
// len(b) bytes long, so that a byte offset into the result is also a valid
// byte offset into b. Callers rely on this to locate words in the original
// text.
//
// Two cases need care to keep the lengths equal:
//   - A byte that is not part of valid UTF-8 is replaced by a single '?'.
//   - A rune whose lowercase form has a different encoded length (for
//     example U+0130, whose lowercase is ASCII 'i') is copied unchanged.
//
// TODO: There is a proposal in Go to provide a UTF-8 handler
// that would make this nicer. Use it if it arrives.
// https://github.com/golang/go/issues/25805
func toLower(b []byte) string {
	var s strings.Builder
	// The output has the same length as the input, so allocate it once.
	s.Grow(len(b))
	for i := 0; i < len(b); {
		r, wid := utf8.DecodeRune(b[i:])
		i += wid
		if r == utf8.RuneError && wid == 1 {
			// Invalid encoding: stand in one ASCII byte for the bad byte.
			s.WriteByte('?')
			continue
		}
		// Accept the lowercase form only if it occupies the same number
		// of bytes; otherwise keep the original rune.
		if l := unicode.ToLower(r); utf8.RuneLen(l) == wid {
			r = l
		}
		s.WriteRune(r)
	}
	return s.String()
}
// isWordChar reports whether r may appear in a word. At present that means
// r is a Unicode letter as defined by unicode.IsLetter, so digits,
// punctuation, and white space all end a word; the definition may change.
// normalize lowercases its input before calling this, although case makes
// no difference to the result.
func isWordChar(r rune) bool {
return unicode.IsLetter(r)
}
// maxListMarkerLength is an upper bound, in bytes, on the length of any
// entry in listMarker. Building listMarker panics if an entry exceeds it.
const maxListMarkerLength = 4

// listMarker is the set of lowercase words that may serve as list markers:
// the single letters "a" through "r" and a selection of multi-letter
// lowercase Roman numerals up to "xv".
var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv vi vii viii ix xi xii xiii xiv xv"
	markers := strings.Fields(allListMarkers)
	set := make(map[string]bool, len(markers))
	for i := range markers {
		m := markers[i]
		if len(m) > maxListMarkerLength {
			// Programmer error: the table and the bound disagree.
			panic("marker too long")
		}
		set[m] = true
	}
	return set
}()
// isListMarker reports whether s looks like a list marker in context: s
// must be an entry in listMarker and the rune immediately following it,
// nextRune, must be one of '.', ':' or ')'. Examples are "i." and "a)".
func isListMarker(s string, nextRune rune) bool {
	closes := nextRune == '.' || nextRune == ':' || nextRune == ')'
	return closes && listMarker[s]
}