-
Notifications
You must be signed in to change notification settings - Fork 206
/
words.go
75 lines (70 loc) · 2.19 KB
/
words.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
// Copyright 2016 Documize Inc. <legal@documize.com>. All rights reserved.
//
// This software (Documize Community Edition) is licensed under
// GNU AGPL v3 http://www.gnu.org/licenses/agpl-3.0.en.html
//
// You can operate outside the AGPL restrictions by purchasing
// Documize Enterprise Edition and obtaining a commercial license
// by contacting <sales@documize.com>.
//
// https://documize.com
package utility
import "unicode"
import nethtml "golang.org/x/net/html"
// Words returns a slice of words, where each word contains no whitespace, and each item of punctuation is its own word.
// This functionality is provided to enable verification of the text extraction algorithem across different implemntations.
func Words(ch HTML, inSqBr int, testMode bool) ([]string, int, error) {
txt, err := ch.Text(testMode)
if err != nil {
return nil, inSqBr, err
}
txt = nethtml.UnescapeString(txt)
words := []string{""}
for _, c := range txt {
if inSqBr > 0 {
switch c {
case ']':
inSqBr--
case '[':
inSqBr++
}
} else {
if c == rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
if testMode {
c = ' ' // NOTE only replace with a space here if we are testing
}
}
if c != rune(0x200B) { // http://en.wikipedia.org/wiki/Zero-width_space
if c == '[' {
inSqBr = 1
words = append(words, "[") // open square bracket means potentially elided text
words = append(words, "")
} else {
inSqBr = 0
if unicode.IsPunct(c) || unicode.IsSymbol(c) || unicode.IsDigit(c) {
if words[len(words)-1] == "" {
words[len(words)-1] = string(c)
} else {
words = append(words, string(c))
}
words = append(words, "")
} else {
if unicode.IsGraphic(c) || unicode.IsSpace(c) {
if unicode.IsSpace(c) {
if words[len(words)-1] != "" {
words = append(words, "")
}
} else {
words[len(words)-1] += string(c)
}
}
}
}
}
}
}
if !testMode { // add dummy punctuation if not in test mode to avoid incorrect sentance concatanation
words = append(words, ".")
}
return append(words, ""), inSqBr, nil // make sure there is always a blank entry at the end
}