-
-
Notifications
You must be signed in to change notification settings - Fork 140
/
words.go
121 lines (99 loc) · 3 KB
/
words.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
package spell
import (
"strings"
"unicode"
"github.com/jdkato/regexp"
)
// number form, may include dots, commas and dashes
var numberRegexp = regexp.MustCompile("^([0-9]+[.,-]?)+$")
// number form with units, e.g. 123ms, 12in 1ft
var numberUnitsRegexp = regexp.MustCompile("^[0-9]+[a-zA-Z]+$")
// 0x12FF or 0x1B or x12FF
// does anyone use 0XFF ??
var numberHexRegexp = regexp.MustCompile("^0?[x][0-9A-Fa-f]+$")
var numberBinaryRegexp = regexp.MustCompile("^0[b][01]+$")
var camelCaseRegexp1 = regexp.MustCompile("[A-Z]+")
var shaHashRegexp = regexp.MustCompile("^[0-9a-z]{40}$")
// Splitter splits a text into words
// Highly likely this implementation will change so we are encapsulating.
type Splitter struct {
fn func(c rune) bool
}
// Split is the function to split an input into a `[]string`
func (s *Splitter) Split(in string) []string {
return strings.FieldsFunc(in, s.fn)
}
// NewSplitter creates a new splitter. The input is a string in
// UTF-8 encoding. Each rune in the string will be considered to be a
// valid word character. Runes that are NOT here are deemed a word
// boundary Current implementation uses
// https://golang.org/pkg/strings/#FieldsFunc
func NewSplitter(chars string) *Splitter {
s := Splitter{}
s.fn = (func(c rune) bool {
// break if it's not a letter, and not another special character
return !unicode.IsLetter(c) && -1 == strings.IndexRune(chars, c)
})
return &s
}
func isNumber(s string) bool {
return numberRegexp.MatchString(s)
}
func isNumberBinary(s string) bool {
return numberBinaryRegexp.MatchString(s)
}
// is word in the form of a "number with units", e.g. "101ms", "3ft",
// "5GB" if true, return the units, if not return empty string This is
// highly English based and not sure how applicable it is to other
// languages.
func isNumberUnits(s string) string {
// regexp.FindAllStringSubmatch is too confusing
if !numberUnitsRegexp.MatchString(s) {
return ""
}
// Starts with a number
for idx, ch := range s {
if ch >= '0' && ch <= '9' {
continue
}
return s[idx:]
}
panic("assertion failed")
}
func isNumberHex(s string) bool {
return numberHexRegexp.MatchString(s)
}
func isHash(s string) bool {
return shaHashRegexp.MatchString(s)
}
func splitCamelCase(s string) []string {
out := []string{}
s = strings.Replace(s, "HTTP", "Http", -1)
s = strings.Replace(s, "HTML", "Html", -1)
s = strings.Replace(s, "URL", "Url", -1)
s = strings.Replace(s, "URI", "Uri", -1)
caps := camelCaseRegexp1.FindAllStringIndex(s, -1)
// all lower case
if len(caps) == 0 {
return nil
}
// is only first character capitalized? or is the whole word capitalized
if len(caps) == 1 && caps[0][0] == 0 && (caps[0][1] == 1 || caps[0][1] == len(s)) {
return nil
}
last := 0
for i := 0; i < len(caps); i++ {
if last != caps[i][0] {
out = append(out, s[last:caps[i][0]])
last = caps[i][0]
}
if caps[i][1]-caps[i][0] > 1 {
out = append(out, s[caps[i][0]:caps[i][1]])
last = caps[i][1]
}
}
if last < len(s) {
out = append(out, s[last:])
}
return out
}