defmodule Tokenizers.PreTokenizer do
  @moduledoc """
  Pre-tokenizers.

  A pre-tokenizer takes care of splitting the input according to a set
  of rules. This pre-processing lets you ensure that the underlying
  model does not build tokens across multiple "splits". For example,
  if you don't want to have whitespace inside a token, you can use a
  pre-tokenizer that splits on whitespace.

  You can easily combine multiple pre-tokenizers together using
  `sequence/1`.

  A pre-tokenizer is also allowed to modify the string, just like a
  normalizer does. This is necessary to allow some complicated
  algorithms that require splitting before normalizing (e.g. ByteLevel).
  """

  defstruct [:resource]

  @type t() :: %__MODULE__{resource: reference()}
@doc """
Converts a string into a sequence of pre-tokens.
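
  ## Example

  A minimal sketch, assuming a Whitespace pre-tokenizer; the second
  tuple element holds offsets into the input:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.whitespace()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hello there!")
      {:ok, [{"Hello", {0, 5}}, {"there", {6, 11}}, {"!", {11, 12}}]}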
"""
@spec pre_tokenize(t(), String.t()) :: {:ok, [{String.t(), {integer(), integer()}}]}
defdelegate pre_tokenize(pre_tokenizer, input),
to: Tokenizers.Native,
as: :pre_tokenizers_pre_tokenize
@doc """
Creates a ByteLevel pre-tokenizer.
Splits on whitespaces while remapping all the bytes to a set of
visible characters. This technique has been introduced by OpenAI
with GPT-2 and has some more or less nice properties:
* Since it maps on bytes, a tokenizer using this only requires
256 characters as initial alphabet (the number of values a byte
can have), as opposed to the 130,000+ Unicode characters.
* A consequence of the previous point is that it is absolutely
unnecessary to have an unknown token using this since we can
represent anything with 256 tokens (Youhou!! ππ)
* For non ascii characters, it gets completely unreadable, but it
works nonetheless!
## Options
* `:add_prefix_space` - whether to add a space to the first word
if there isnβt already one. This lets us treat hello exactly
like say hello. Defaults to `true`
* `:use_regex` - set this to `false` to prevent this pre-tokenizer
from using the GPT2 specific regexp for splitting on whitespace.
Defaults to `true`
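
  ## Example

  A minimal sketch; the remapped characters, such as `Ġ` standing in
  for a space, come from the byte-level alphabet:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.byte_level()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hello there!")
      {:ok, [{"ĠHello", {0, 5}}, {"Ġthere", {5, 11}}, {"!", {11, 12}}]}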
"""
@spec byte_level(keyword()) :: t()
defdelegate byte_level(opts \\ []), to: Tokenizers.Native, as: :pre_tokenizers_byte_level
@doc """
Gets ByteLevel pre-tokenizer's alphabet.
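
  ## Example

  A quick sanity check; the alphabet covers all 256 byte values:

      iex> Tokenizers.PreTokenizer.byte_level_alphabet() |> length()
      256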
"""
@spec byte_level_alphabet() :: charlist()
defdelegate byte_level_alphabet(),
to: Tokenizers.Native,
as: :pre_tokenizers_byte_level_alphabet
@doc """
Creates a Whitespace pre-tokenizer.
Splits on word boundaries. Uses the following regular expression:
`\w+|[^\w\s]+`.
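
  ## Example

  A sketch borrowing the upstream `tokenizers` example; punctuation
  becomes its own pre-token:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.whitespace()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hey friend!")
      {:ok, [{"Hey", {0, 3}}, {"friend", {4, 10}}, {"!", {10, 11}}]}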
"""
@spec whitespace() :: t()
defdelegate whitespace(), to: Tokenizers.Native, as: :pre_tokenizers_whitespace
@doc """
Creates a WhitespaceSplit pre-tokenizer.
Splits on any whitespace character.
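
  ## Example

  A sketch to contrast with `whitespace/0`; here punctuation stays
  attached to the word:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.whitespace_split()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hey friend!")
      {:ok, [{"Hey", {0, 3}}, {"friend!", {4, 11}}]}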
"""
@spec whitespace_split() :: t()
defdelegate whitespace_split(), to: Tokenizers.Native, as: :pre_tokenizers_whitespace_split
@doc """
Creates a BertPreTokenizer pre-tokenizer.
Splits for use in Bert models.
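
  ## Example

  A sketch; on this input the result should match `whitespace/0`:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.bert_pre_tokenizer()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hey friend!")
      {:ok, [{"Hey", {0, 3}}, {"friend", {4, 10}}, {"!", {10, 11}}]}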
"""
@spec bert_pre_tokenizer() :: t()
defdelegate bert_pre_tokenizer(), to: Tokenizers.Native, as: :pre_tokenizers_bert
@doc """
Creates Metaspace pre-tokenizer.
Splits on whitespaces and replaces them with a special char βββ
(U+2581).
## Options
* `:replacement` - the replacement character to use. Defaults to `"β"`
* `:prepend_scheme` - whether to add a space to the first word if there
isn't already one. This lets us treat "hello" exactly like "say hello".
Either of `:always`, `:never`, `:first`. `:first` means the space is
only added on the first token (relevant when special tokens are used
or other pre_tokenizer are used). Defaults to `:always`
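
  ## Example

  A minimal sketch; with the default `:prepend_scheme` the prepended
  space becomes "▁" as well:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.metaspace()
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hello there")
      {:ok, [{"▁Hello", {0, 5}}, {"▁there", {5, 11}}]}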
"""
@spec metaspace(keyword()) :: t()
defdelegate metaspace(opts \\ []), to: Tokenizers.Native, as: :pre_tokenizers_metaspace
@doc """
Creates a CharDelimiterSplit pre-tokenizer.
This pre-tokenizer simply splits on the provided delimiter. Works
almost like simple split function, except that it accounts for
multiple consecutive spaces.
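
  ## Example

  A sketch; note the delimiter is a codepoint, hence `?-`. The exact
  offsets are an assumption based on the upstream behaviour:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.char_delimiter_split(?-)
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "the-final-countdown")
      {:ok, [{"the", {0, 3}}, {"final", {4, 9}}, {"countdown", {10, 19}}]}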
"""
@spec char_delimiter_split(char()) :: t()
defdelegate char_delimiter_split(delimiter),
to: Tokenizers.Native,
as: :pre_tokenizers_char_delimiter_split

  @typedoc """
  Specifies how the delimiter should behave for several pre-tokenizers.
  """
  @type split_delimiter_behaviour() ::
          :removed
          | :isolated
          | :merged_with_previous
          | :merged_with_next
          | :contiguous
@doc """
Creates a Split pre-tokenizer using a string as split pattern.
Versatile pre-tokenizer that splits on provided pattern and according
to provided behavior.
## Options
* `:invert` - whether to invert the split or not. Defaults to `false`
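
  ## Example

  A sketch splitting on a literal space with the `:removed` behaviour:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.split(" ", :removed)
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hello there")
      {:ok, [{"Hello", {0, 5}}, {"there", {6, 11}}]}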
"""
@spec split(String.t(), split_delimiter_behaviour(), keyword()) :: t()
def split(pattern, behavior, opts \\ []) when is_binary(pattern) do
Tokenizers.Native.pre_tokenizers_split({:string, pattern}, behavior, opts)
end
@doc ~S"""
Creates a Split pre-tokenizer using a regular expression as split pattern.
Versatile pre-tokenizer that splits on provided regex pattern and according
to provided behavior.
The `pattern` should be a string representing a regular expression
according to the [Oniguruma Regex Engine](https://github.com/kkos/oniguruma).
## Options
* `:invert` - whether to invert the split or not. Defaults to `false`
## Example
iex> Tokenizers.PreTokenizer.split_regex(~S(\?\d{2}\?), :removed)
#Tokenizers.PreTokenizer<[pre_tokenizer_type: "Split"]>
"""
@spec split_regex(String.t(), split_delimiter_behaviour(), keyword()) :: t()
def split_regex(pattern, behavior, opts \\ []) when is_binary(pattern) do
Tokenizers.Native.pre_tokenizers_split({:regex, pattern}, behavior, opts)
end
@doc """
Creates a Punctuation pre-tokenizer.
Will isolate all punctuation characters.
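
  ## Example

  A sketch using the `:isolated` behaviour:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.punctuation(:isolated)
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Hey friend!")
      {:ok, [{"Hey friend", {0, 10}}, {"!", {10, 11}}]}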
"""
@spec punctuation(split_delimiter_behaviour()) :: t()
defdelegate punctuation(behaviour), to: Tokenizers.Native, as: :pre_tokenizers_punctuation
@doc """
Creates a Sequence pre-tokenizer.
Lets you compose multiple pre-tokenizers that will be run in the
given order.
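
  ## Example

  A sketch composing two of the pre-tokenizers above; the inspected
  output follows the pattern shown for `split_regex/3`:

      iex> Tokenizers.PreTokenizer.sequence([
      ...>   Tokenizers.PreTokenizer.whitespace_split(),
      ...>   Tokenizers.PreTokenizer.punctuation(:isolated)
      ...> ])
      #Tokenizers.PreTokenizer<[pre_tokenizer_type: "Sequence"]>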
"""
@spec sequence([t()]) :: t()
defdelegate sequence(pre_tokenizers), to: Tokenizers.Native, as: :pre_tokenizers_sequence
@doc """
Creates a Digits pre-tokenizer.
Splits the numbers from any other characters.
## Options
* `:individual_digits` - whether to split individual digits or not.
Defaults to `false`
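
  ## Example

  A sketch with `individual_digits: true`, so each digit becomes its
  own pre-token:

      iex> pre_tokenizer = Tokenizers.PreTokenizer.digits(individual_digits: true)
      iex> Tokenizers.PreTokenizer.pre_tokenize(pre_tokenizer, "Call 123")
      {:ok, [{"Call ", {0, 5}}, {"1", {5, 6}}, {"2", {6, 7}}, {"3", {7, 8}}]}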
"""
@spec digits(keyword()) :: t()
defdelegate digits(opts \\ []),
to: Tokenizers.Native,
as: :pre_tokenizers_digits
end

defimpl Inspect, for: Tokenizers.PreTokenizer do
  import Inspect.Algebra

  def inspect(pre_tokenizer, opts) do
    # Fetch the native pre-tokenizer's info and render it as a keyword list
    attrs =
      pre_tokenizer
      |> Tokenizers.Native.pre_tokenizers_info()
      |> Keyword.new(fn {k, v} -> {String.to_atom(k), v} end)

    concat(["#Tokenizers.PreTokenizer<", to_doc(attrs, opts), ">"])
  end
end