generated from discourse/discourse-plugin-skeleton
/
basic_tokenizer.rb
39 lines (32 loc) · 1.09 KB
/
basic_tokenizer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# frozen_string_literal: true
module DiscourseAi
  module Tokenizer
    # Base class exposing class-level helpers for counting and trimming text
    # in terms of tokens.
    #
    # Every helper goes through +tokenizer+, which this class leaves
    # unimplemented. The object it returns is used as follows:
    # * +encode(text)+ must return something responding to +tokens+ and +ids+
    # * +decode(ids)+ must turn a list of ids back into text
    class BasicTokenizer
      class << self
        # The tokenizer backend used by all other helpers.
        #
        # @raise [NotImplementedError] always, in this base class; concrete
        #   tokenizers are expected to override this method.
        def tokenizer
          raise NotImplementedError
        end

        # Encodes +text+ and returns the +tokens+ of the resulting encoding.
        #
        # @param text the text to encode
        # @return whatever the encoding's +tokens+ yields
        def tokenize(text)
          encoding = tokenizer.encode(text)
          encoding.tokens
        end

        # Number of tokens +text+ is split into, as reported by +tokenize+.
        #
        # @param text the text to measure
        # @return the +size+ of the token list
        def size(text)
          tokens = tokenize(text)
          tokens.size
        end

        # Limits +text+ to at most +max_length+ tokens.
        #
        # @param text the text to shorten
        # @param max_length the maximum number of token ids to keep
        # @return +text+ itself when the shortcut below applies, otherwise the
        #   result of decoding the first +max_length+ token ids
        def truncate(text, max_length)
          # Shortcut for the common case: unless strict counting is enabled,
          # a text shorter than half of max_length (in characters) is returned
          # as-is without tokenizing. The halving leaves headroom for unicode
          # characters that can take more than one token each.
          lenient = !SiteSetting.ai_strict_token_counting
          return text if lenient && text.size < max_length / 2

          kept_ids = tokenizer.encode(text).ids.take(max_length)
          tokenizer.decode(kept_ids)
        end

        # Tells whether +text+ and +addition+ together stay strictly below
        # +max_length+ tokens.
        #
        # @param text the existing text
        # @param addition the text that would be added
        # @param max_length the exclusive token limit
        # @return [Boolean]
        def can_expand_tokens?(text, addition, max_length)
          # Same character-count shortcut as in truncate: unless strict
          # counting is enabled, skip tokenizing when the combined character
          # count is below half of the limit (unicode characters can take
          # more than one token each).
          unless SiteSetting.ai_strict_token_counting
            return true if text.size + addition.size < max_length / 2
          end

          text_token_count = tokenizer.encode(text).ids.length
          addition_token_count = tokenizer.encode(addition).ids.length
          text_token_count + addition_token_count < max_length
        end
      end
    end
  end
end