-
Notifications
You must be signed in to change notification settings - Fork 2
/
edgengram.go
138 lines (131 loc) · 4.97 KB
/
edgengram.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
package gorean
import (
"fmt"
)
// GenerateEdgeNGramTokens builds edge n-gram tokens for str so that a service
// can perform forward (prefix) matching on Korean text.
//
// str is decomposed with Split(str, SplitOptBasic). For every character and
// for every jamo position inside that character, the text preceding the
// character plus the partially composed character is emitted as a token (see
// basicGenerateToken). On top of those base tokens, extra tokens are emitted:
//
//   - for an initial consonant found in koreanCoupleJaumFirstMap, the prefix
//     followed by the mapped consonant;
//   - for a Korean initial consonant, the previous character re-composed with
//     that consonant appended (previous character has two jamo) or with its
//     final consonant merged via koreanCoupleJaumMap (previous character has
//     three jamo);
//   - for a medial vowel found in koreanCoupleMoumFirstMap, the character
//     composed with the mapped vowel;
//   - for a final consonant found in koreanCoupleJaumFirstMap, the character
//     composed with the mapped consonant.
//
// An error from Split is returned unchanged together with a nil slice.
// JoinTokens failures are not fatal: they are printed to stdout and the
// affected token is skipped.
//
// NOTE(review): the code indexes the runes of str with the index of each
// split, so it assumes Split yields exactly one entry per rune — confirm
// against Split.
func GenerateEdgeNGramTokens(str string) ([]string, error) {
	var tokens []string
	argRunes := []rune(str)

	// prefixes[i] is the string made of the first i+1 runes of str.
	prefixes := make([]string, len(argRunes))
	for i := range argRunes {
		prefixes[i] = string(argRunes[:i+1])
	}

	splits, err := Split(str, SplitOptBasic)
	if err != nil {
		return nil, err
	}

	var previousSplit []string
	for splitIndex, split := range splits {
		// stackedKeyword is everything that precedes the current character.
		stackedKeyword := ""
		if splitIndex >= 1 {
			stackedKeyword = prefixes[splitIndex-1]
			previousSplit = splits[splitIndex-1]
		}

		for splitPos := range split {
			switch {
			case splitPos == 0 && split[0] != " " && isKoreanHex(int64(argRunes[splitIndex])):
				// Initial consonant that is not whitespace and belongs to a Korean character.
				chosung := split[0]
				if coupleJaumFirst := koreanCoupleJaumFirstMap[chosung]; coupleJaumFirst != "" {
					chosung = coupleJaumFirst
					tokens = append(tokens, stackedKeyword+chosung)
				}

				switch len(previousSplit) {
				case 2:
					// Build a fresh slice instead of appending to previousSplit:
					// previousSplit is an element of splits, and appending to it
					// could write into a backing array shared with other data.
					alphabets := []string{previousSplit[0], previousSplit[1], chosung}
					if joinAlphabet, joinAlphabetErr := JoinTokens(alphabets); joinAlphabetErr != nil {
						// Allowed to skip.
						fmt.Printf("이전글자 초성 중성과 현재 글자의 초성 조합 중 토큰조합 실패: %s, %s\n%v\n", str, alphabets, joinAlphabetErr)
					} else {
						// Replace the previous character with the re-composed one.
						tokens = append(tokens, string(argRunes[0:splitIndex-1])+joinAlphabet)
					}
				case 3:
					// Try to merge the previous final consonant with this initial consonant.
					if coupleJaum := koreanCoupleJaumMap[previousSplit[2]+chosung]; coupleJaum != "" {
						alphabets := []string{previousSplit[0], previousSplit[1], coupleJaum}
						if !koreanDoubleJaumMap[alphabets[2]] {
							if joinAlphabet, joinAlphabetErr := JoinTokens(alphabets); joinAlphabetErr != nil {
								// Allowed to skip.
								fmt.Printf("이전글자의 종성과 현재글자의 초성 조합 중 토큰조합 실패: %s, %s\n%v\n", str, alphabets, joinAlphabetErr)
							} else {
								tokens = append(tokens, string(argRunes[0:splitIndex-1])+joinAlphabet)
							}
						}
					}
				}

			case splitPos == 1:
				// Medial vowel: when it has an entry in koreanCoupleMoumFirstMap,
				// also emit the character composed with the mapped vowel.
				jungsung := split[1]
				if coupleMoumFirst := koreanCoupleMoumFirstMap[jungsung]; coupleMoumFirst != "" {
					alphabets := []string{split[0], coupleMoumFirst}
					if joinAlphabet, joinAlphabetErr := JoinTokens(alphabets); joinAlphabetErr != nil {
						// Allowed to skip.
						fmt.Printf("조합 중성에 대한 분리 중 토큰조합 실패: %s, %s\n%v\n", str, alphabets, joinAlphabetErr)
					} else {
						tokens = append(tokens, stackedKeyword+joinAlphabet)
					}
				}

			case splitPos == 2:
				// Final consonant: when it has an entry in koreanCoupleJaumFirstMap,
				// also emit the character composed with the mapped consonant.
				jongsung := split[2]
				if coupleJaumFirst := koreanCoupleJaumFirstMap[jongsung]; coupleJaumFirst != "" {
					alphabets := []string{split[0], split[1], coupleJaumFirst}
					if joinAlphabet, joinAlphabetErr := JoinTokens(alphabets); joinAlphabetErr != nil {
						// Allowed to skip.
						fmt.Printf("조합 종성에 대한 분리로직 중 토큰조합 실패: %s, %s\n%v\n", str, alphabets, joinAlphabetErr)
					} else {
						tokens = append(tokens, stackedKeyword+joinAlphabet)
					}
				}
			}

			// Base token for this jamo position, emitted after any extra tokens.
			tokens = basicGenerateToken(tokens, split, &stackedKeyword, splitPos, &str)
		}
	}
	return tokens, nil
}
// basicGenerateToken appends the base n-gram token for one jamo position of a
// single character and returns the extended slice.
//
//   - tokens is the result slice collected so far.
//   - split holds the decomposed parts of a single character.
//   - stackedKeyword points to the text that precedes the character; it is
//     read but never modified.
//   - splitPos is the index into split of the last part to include.
//   - str points to the whole input string; it is used only in the log
//     message printed when joining fails.
//
// For splitPos 0 the token is the prefix followed by split[0] as-is. For any
// other position split[0..splitPos] is composed with JoinTokens and appended
// to the prefix. If JoinTokens fails, the failure is printed to stdout and
// the bare prefix is still appended as the token.
func basicGenerateToken(tokens []string, split []string, stackedKeyword *string, splitPos int, str *string) []string {
	result := *stackedKeyword

	// Gather the parts from the first one up to and including splitPos.
	var parts []string
	for i := 0; i <= splitPos; i++ {
		parts = append(parts, split[i])
	}

	switch splitPos {
	case 0:
		result += split[splitPos]
	default:
		joined, joinErr := JoinTokens(parts)
		if joinErr == nil {
			result += joined
		} else {
			// Allowed to skip: the token falls back to the prefix alone.
			fmt.Printf("조건 없는 split을 한 token의 조합 중 토큰조합 실패: %s, %s\n%v\n", *str, parts, joinErr)
		}
	}

	return append(tokens, result)
}