// Copyright 2019 Melvin Davis<hi@melvindavis.me>. All rights reserved.
// Use of this source code is governed by a Melvin Davis<hi@melvindavis.me>
// license that can be found in the LICENSE file.
package interpreter

import (
	"errors"
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	goahocorasick "github.com/anknown/ahocorasick"
	"github.com/cuttle-ai/octopus/datetime"
)

/*
 * This file contains the utilities for tokenizing a sentence.
 */

//Tokenize will tokenize a given sentence according to the tokenizer of the given id
func Tokenize(id string, sentence []rune) ([]FastToken, error) {
	/*
	 * We will initiate the datetime service
	 * Then we will precache the dictionary for the id
	 * We will make a request to the tokenizer to get the sentence tokenized
	 * Then we will build the unknowns
	 * Then we will adjust the date nodes
	 * Then we will adjust the positions
	 * Then we will convert all the tokens to fast tokens and return them
	 */
	//start checking for the dates
	ch, err := StartCheckingForDates(sentence)
	if err != nil {
		//error while checking for the dates
		return nil, err
	}

	//precaching the dictionary for the id
	dictReq := DICTRequest{ID: id, Type: DICTPreCache, Out: make(chan DICTRequest)}
	SendDICTToChannel(DICTInputChannel, dictReq)
	<-dictReq.Out

	//requesting the tokenizer to tokenize the given sentence
	req := Request{
		ID:       id,
		Type:     TokenizerGet,
		Sentence: sentence,
		Out:      make(chan Request),
	}
	go SendTokenizerToChannel(TokenizerInputChannel, req)
	res := <-req.Out
	if !res.Valid {
		return nil, errors.New("couldn't tokenize the given sentence for the id " + id)
	}

	//adjusting the date fields
	//adding the date fields before adding the unknowns is important as the
	//positions still refer to the original positions in the sentence
	res.Matches = BuildTimeNodes(res.Matches, ch)
	//building the unknowns
	res.Matches = BuildUnknowns(sentence, res.Matches)
	//adjusting the positions of the tokens according to the position in the token list
	res.Matches = AdjustPositions(res.Matches)

	//fast tokenizing the tokens
	result := []FastToken{}
	for _, v := range res.Matches {
		result = append(result, v.FastToken())
	}
	return result, nil
}
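
// A minimal usage sketch for Tokenize (illustrative only: the id "org-1" and the sentence are
// hypothetical, and a dictionary/tokenizer is assumed to have already been cached for that id;
// log and fmt are assumed to be imported by the caller):
//
//	tokens, err := Tokenize("org-1", []rune("show me the sales for last month"))
//	if err != nil {
//		log.Fatal(err)
//	}
//	fmt.Println(len(tokens), "tokens found")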

//RequestType is the type of the request for the tokenizer
type RequestType uint

const (
	//TokenizerAdd adds a tokenizer for the given id
	TokenizerAdd RequestType = 1
	//TokenizerGet returns the tokenizer of a given id
	TokenizerGet RequestType = 2
	//TokenizerRemove removes the tokenizer from the cache
	TokenizerRemove RequestType = 3
)

//Tokenizer has the tokens map and machine for storing the state of the tokens trie
type Tokenizer struct {
	//Machine has the machine storing the trie
	Machine *goahocorasick.Machine
	//Map has the tokens mapped to their word
	Map map[string]Token
}

//Request can be used to make a request to the tokenizer cache
type Request struct {
	//ID to which the tokenizer belongs
	ID string
	//Type is the type of the tokenizer request. It can be Add, Get or Remove
	Type RequestType
	//Tokenizer is the tokenizer under watch
	Tokenizer Tokenizer
	//Sentence is the sentence to be tokenized
	Sentence []rune
	//Valid indicates whether the result is valid or not
	Valid bool
	//Matches holds the matched tokens
	Matches []Token
	//Out channel for sending the response to the requester
	Out chan Request
}

//TokenizerInputChannel is the input channel to communicate with the cache
var TokenizerInputChannel chan Request

func init() {
	TokenizerInputChannel = make(chan Request)
	go Cache(TokenizerInputChannel)
}

//SendTokenizerToChannel sends a tokenizer request to the given channel. This function is intended
//to be used with goroutines so that the sender isn't blocked while the request is being delivered
func SendTokenizerToChannel(ch chan Request, req Request) {
	ch <- req
}
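
// A hedged sketch of how a tokenizer could be registered with the cache (the id, the word "sales"
// and its Token value are illustrative; note that TokenizerAdd does not send a response on Out):
//
//	addReq := Request{
//		ID:   "org-1",
//		Type: TokenizerAdd,
//		Tokenizer: Tokenizer{Map: map[string]Token{
//			"sales": {Word: []rune("sales")},
//		}},
//	}
//	go SendTokenizerToChannel(TokenizerInputChannel, addReq)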

//Cache is the cache for providing the tokenizer to the platform on demand
func Cache(in chan Request) {
	/*
	 * We will go into an infinite loop
	 * Will wait for the requests to come through the channel
	 * Based on the type of the request we will add, return or remove tokenizers from memory
	 */
	dict := make(map[string]Tokenizer)
	for {
		req := <-in
		switch req.Type {
		case TokenizerAdd:
			//build an aho-corasick machine out of the lowercased dictionary words
			m := new(goahocorasick.Machine)
			d := [][]rune{}
			for word := range req.Tokenizer.Map {
				if len(word) == 0 {
					continue
				}
				d = append(d, []rune(strings.ToLower(word)))
			}
			if err := m.Build(d); err != nil {
				fmt.Println(err)
				break
			}
			dict[req.ID] = Tokenizer{Map: req.Tokenizer.Map, Machine: m}
		case TokenizerGet:
			t, mOk := dict[req.ID]
			if !mOk {
				req.Valid = false
				go SendTokenizerToChannel(req.Out, req)
				break
			}
			//search the lowercased sentence for the dictionary words and map the matches back to tokens
			terms := t.Machine.MultiPatternSearch([]rune(strings.ToLower(string(req.Sentence))), false)
			result := []Token{}
			for _, term := range terms {
				tok, ok := t.Map[string(term.Word)]
				if ok {
					result = append(result, Token{Pos: term.Pos, Word: term.Word, Nodes: tok.Nodes})
				}
			}
			req.Matches = result
			req.Valid = true
			go SendTokenizerToChannel(req.Out, req)
		case TokenizerRemove:
			delete(dict, req.ID)
		}
	}
}

//BuildUnknowns builds the unknown nodes.
//It returns the tokens with the unidentified words in the sentence transformed into unknown tokens
func BuildUnknowns(sentence []rune, toks []Token) []Token {
	/*
	 * We will iterate through the sentence
	 * Until the position of each token is reached we will iterate through the sentence and build each word
	 * Upon reaching a position, it will add the word to the words list
	 * Once all the unknown words are recorded, the resultant words are consolidated as tokens
	 */
	result := []Token{}
	words := [][]rune{}
	tok := 0
	word := []rune{}
	tokenMap := map[string]Token{}

	//iterating through the sentence
	for p := 0; p < len(sentence); p++ {
		//checking whether the rune is outside the token
		if (len(toks) > tok && p < toks[tok].Pos) || len(toks) <= tok {
			word = append(word, sentence[p])
		} else if len(toks) > tok && toks[tok].Pos == p {
			//need to append only if there exists a word
			if len(word) > 0 {
				words = append(words, word)
				word = []rune{}
			}
			tokenMap[string(toks[tok].Word)] = toks[tok]
			words = append(words, toks[tok].Word)
			p = toks[tok].Pos + len(toks[tok].Word)
			tok++
		}
	}
	//if any pending word remains, copy it to the words list
	if len(word) > 0 {
		words = append(words, word)
	}

	//consolidate the words as tokens, reusing the existing token for known words
	for i, w := range words {
		// ws := strings.Split(string(w), " ")
		// for i, wr := range ws {
		//no need to take empty strings
		if len(w) == 0 {
			continue
		}
		tok := Token{
			Word: []rune(w),
		}
		oTok, ok := tokenMap[string(w)]
		if ok {
			tok.Nodes = oTok.Nodes
		} else {
			tok.Nodes = []Node{&UnknownNode{UID: fmt.Sprint("U", i), Word: []rune(w)}}
		}
		result = append(result, tok)
		// }
	}
	return result
}
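
// An illustrative sketch of the transformation (hypothetical input; assumes the dictionary
// matched only the word "sales"):
//
//	sentence: "total sales last year"
//	matches:  the single token for "sales"
//	result:   a token for the run of text before "sales", the "sales" token itself (keeping its
//	          original nodes), and a token for the run of text after it, with the unmatched runs
//	          wrapped in UnknownNodes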

//AdjustPositions will adjust the positions of the tokens corresponding to the
//position of the token in the token list rather than the sentence
func AdjustPositions(toks []Token) []Token {
	/*
	 * We will iterate through the tokens and put the position as the index of the token
	 */
	result := []Token{}
	for k, tok := range toks {
		result = append(result, Token{Pos: k, Word: tok.Word, Nodes: tok.Nodes})
	}
	return result
}
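
// For example (hypothetical positions), tokens at sentence positions 0, 6 and 12 come out of
// AdjustPositions with positions 0, 1 and 2.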

//StartCheckingForDates will initiate the service which starts checking for the date/time presence in the query
func StartCheckingForDates(sentence []rune) (chan datetime.Results, error) {
	ser, err := datetime.DefaultService()
	if err != nil {
		return nil, errors.New("error while getting the date service: " + err.Error())
	}
	return ser.Query(sentence), nil
}

//timeResults implements sort.Interface so that the datetime responses can be ordered by their start position
type timeResults []datetime.Response

func (r timeResults) Len() int           { return len(r) }
func (r timeResults) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
func (r timeResults) Less(i, j int) bool { return r[i].Start < r[j].Start }

//BuildTimeNodes will insert time nodes if a valid response from the date service is received for its place.
//This function won't replace any existing tokens. If a conflict between an existing node and a time node
//comes up, the time node will be skipped with priority given to the existing node.
func BuildTimeNodes(toks []Token, ch chan datetime.Results) []Token {
	/*
	 * We will wait for the channel to dump the results
	 * We will check whether the results are valid or not
	 * Then we will process it
	 */
	//wait for the channel to dump the response
	result := &datetime.Results{}
	select {
	case res := <-ch:
		result = &res
	case <-time.After(3 * time.Second):
		break
	}

	//checking whether the results are valid or not
	if !result.IsValid() {
		return toks
	}

	//processing the result
	//first we will sort the valid results according to the start and end position
	//then we will insert the tokens for time nodes in between the tokens
	validResults := []datetime.Response{}
	for _, resp := range result.Res {
		if resp.IsValid() {
			validResults = append(validResults, resp)
		}
	}
	sort.Sort(timeResults(validResults))

	i, j := 0, 0
	for i < len(validResults) && j < len(toks) {
		result := validResults[i]
		startIndex := result.Start
		endIndex := result.End
		//if the end index is < the token's position, then we can
		//insert the result as a time node to the tokens
		if endIndex < toks[j].Pos {
			tok := Token{
				Pos:  startIndex,
				Word: []rune(result.Body),
				Nodes: []Node{
					&TimeNode{
						UID:   "T" + strconv.Itoa(i),
						Word:  []rune(result.Value.Value),
						Value: result.Value,
					},
				},
			}
			toks = append(toks[:j], append([]Token{tok}, toks[j:]...)...)
			i++
		} else if startIndex <= toks[j].Pos {
			//if the time result has intersected a known node
			//then priority is given to the known node, so we skip it and move ahead
			i++
			j++
		} else if startIndex > toks[j].Pos {
			//we haven't reached the relevant position in the token list yet
			j++
		}
	}

	//once the token list is exhausted, append the remaining
	//time results to the tokens list
	timeNodes := []Token{}
	for i < len(validResults) {
		result := validResults[i]
		timeNodes = append(timeNodes, Token{
			Pos:  result.Start,
			Word: []rune(result.Body),
			Nodes: []Node{
				&TimeNode{
					UID:   "T" + strconv.Itoa(i),
					Word:  []rune(result.Body),
					Value: result.Value,
				},
			},
		})
		i++
	}
	toks = append(toks, timeNodes...)
	return toks
}
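
// A hedged sketch of the merge behaviour (hypothetical values): for the sentence
// "sales last month", a dictionary match for "sales" at position 0 and a date-service
// response covering "last month" would yield the tokens ["sales", "last month"], the
// latter carrying a TimeNode; a date span overlapping "sales" itself would be skipped,
// since existing tokens take priority over time nodes.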