/
normalize.go
53 lines (46 loc) · 1.13 KB
/
normalize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
package indexer
import "strings"
func init() {
accentsMap = make(map[rune]rune)
for _, v := range accents {
for _, r := range v.runes {
accentsMap[r] = v.rep
}
}
}
// replaceAccentsAndApostrophe returns a copy of s in which every rune that has
// an entry in accentsMap is replaced by its mapped rune, and every rune in the
// range U+0300..U+036F (Combining Diacritical Marks) is removed.
//
// Dropping the combining marks means text written in decomposed form, such as
// "e" followed by U+0301, folds to the same result as the precomposed
// character. Previously those marks were left in the output untouched.
//
// Runes that are neither combining marks nor keys of accentsMap are kept
// as-is. The conversion goes through []rune, so any invalid UTF-8 in s is
// turned into U+FFFD.
func replaceAccentsAndApostrophe(s string) string {
	const (
		combiningFirst = 0x0300 // 768
		combiningLast  = 0x036F // 879
	)

	runes := []rune(s)
	// Filter in place: the write position never gets ahead of the read
	// position, so reusing the backing array is safe.
	out := runes[:0]
	for _, c := range runes {
		if c >= combiningFirst && c <= combiningLast {
			continue // drop the combining accent entirely
		}
		if rep, ok := accentsMap[c]; ok {
			c = rep
		}
		out = append(out, c)
	}
	return string(out)
}
// trimApostrophes strips every leading and trailing ASCII apostrophe (') from
// s. Apostrophes inside the string are kept.
func trimApostrophes(s string) string {
	const cutset = "'"
	withoutLeading := strings.TrimLeft(s, cutset)
	return strings.TrimRight(withoutLeading, cutset)
}
func normalizeWord(s string) string {
return trimApostrophes(replaceAccentsAndApostrophe(strings.ToLower(s)))
}
// Accent-folding tables.
var (
	// accentsMap is the rune-to-replacement lookup table. It is declared
	// empty here and populated from accents by init.
	accentsMap map[rune]rune

	// accents lists groups of runes together with the single rune (rep)
	// that stands in for every member of the group.
	accents = []struct {
		runes []rune
		rep   rune
	}{
		{runes: []rune("àáâãäå"), rep: 'a'},
		{runes: []rune("æ"), rep: 'a'}, // really "ae", but the replacement has to be one rune
		{runes: []rune("ç"), rep: 'c'},
		{runes: []rune("èéêë"), rep: 'e'},
		{runes: []rune("ìíîï"), rep: 'i'},
		{runes: []rune("ñ"), rep: 'n'},
		{runes: []rune("òóôõö"), rep: 'o'},
		{runes: []rune("œ"), rep: 'o'}, // really "oe", but the replacement has to be one rune
		{runes: []rune("ùúûü"), rep: 'u'},
		{runes: []rune("ýÿ"), rep: 'y'},
		{runes: []rune("’"), rep: '\''}, // typographic apostrophe -> ASCII apostrophe
	}
)