In [1]:
from tokenizers import Tokenizer, models, Encoding, pre_tokenizers, decoders, Regex, CharBPETokenizer, trainers
import json


# Build the input tokenizer from a fixed symbol list and persist it as JSON.
#
# Every file handle below is opened with an explicit UTF-8 encoding: the JSON
# is rewritten with ensure_ascii=False, so any non-ASCII symbol is written raw
# and the platform default encoding (not UTF-8 on every system) must not be
# relied on.
SYMBOLS_PATH = "../../config/symbols.json"
TOKENIZER_PATH = "../../config/input_tokenizer.json"

# The symbols file is parsed as a JSON array of token strings.
with open(SYMBOLS_PATH, "r", encoding="utf-8") as f:
    vocab_list: list[str] = json.load(f)
special_tokens = ["<PAD>", "<SOS>", "<EOS>", "<UNK>", "<MASK>"]
tokenizer: CharBPETokenizer = CharBPETokenizer(unk_token="<UNK>")

# Special tokens are registered first, then the symbol list.
tokenizer.add_special_tokens(special_tokens)
# NOTE(review): the special tokens were already registered on the previous
# line, so this call looks redundant; confirm before removing it.
tokenizer.add_tokens(special_tokens)
tokenizer.add_tokens(vocab_list)

# NOTE(review): this trainer is never passed to the tokenizer in the cells
# visible here; kept in case a later cell relies on the name.
trainer = trainers.WordLevelTrainer(special_tokens=special_tokens)

# Save the tokenizer, then patch the saved JSON so that its model vocabulary
# is the mapping returned by get_vocab().
# Presumably this makes the added tokens part of the model vocabulary when the
# file is loaded again; verify against the loader.
tokenizer.save(TOKENIZER_PATH)
with open(TOKENIZER_PATH, "r", encoding="utf-8") as f:
    obj = json.load(f)
v: dict[str, int] = tokenizer.get_vocab()
obj["model"]["vocab"] = v
with open(TOKENIZER_PATH, "w", encoding="utf-8") as f:
    json.dump(obj, f, ensure_ascii=False, indent=2)

### Load the generated tokenizer.

In [2]:
# Rebind `tokenizer` to an instance loaded from the JSON file written by the
# first cell, so the cells below exercise the file on disk.
tokenizer: Tokenizer = Tokenizer.from_file("../../config/input_tokenizer.json")

In [3]:
# Round-trip check: encode a sample string, print what the tokenizer produced,
# then decode the ids so the result can be compared with the source by eye.
# The sample begins with three special-token markers, followed by Latin
# letters and Arabic-script text.
text = "<EOS><SOS><PAD> Helloياخشىمۇسىز، قانداق ئەھۋالىڭىز؟ ئايدىڭلاشتۇرالمايۋاتقانلىقىڭىزدىنمىكىنتاڭ ئايدىڭلاشتۇرالمايۋاتقاندەك قىلىسىز."
encoded: Encoding = tokenizer.encode(text)
# Numeric ids and their token strings, in the same order.
print(f"ids: {encoded.ids}")
print(f"encoded: {encoded.tokens}")
# decode() is called with its default arguments.
decoded: str = tokenizer.decode(encoded.ids)
print(f"len of tokens: {len(encoded.ids)}")
# Source and decoded text are printed on adjacent lines for visual comparison.
print(f"Source : {text}")
print(f"Decoded: {decoded}")

ids: [2, 1, 0, 51, 3, 68, 75, 75, 78, 36, 5, 12, 18, 35, 26, 30, 17, 35, 15, 39, 51, 21, 5, 27, 13, 5, 21, 51, 37, 6, 28, 33, 5, 25, 35, 24, 35, 15, 40, 51, 37, 5, 36, 13, 35, 24, 25, 5, 18, 9, 30, 14, 5, 25, 26, 5, 36, 33, 5, 9, 21, 5, 27, 25, 35, 21, 35, 24, 35, 15, 13, 35, 27, 26, 35, 22, 35, 27, 9, 5, 24, 51, 37, 5, 36, 13, 35, 24, 25, 5, 18, 9, 30, 14, 5, 25, 26, 5, 36, 33, 5, 9, 21, 5, 27, 13, 6, 22, 51, 21, 35, 25, 35, 17, 35, 15, 90]
encoded: ['<EOS>', '<SOS>', '<PAD>', ' ', '<UNK>', 'e', 'l', 'l', 'o', 'ي', 'ا', 'خ', 'ش', 'ى', 'م', 'ۇ', 'س', 'ى', 'ز', '،', ' ', 'ق', 'ا', 'ن', 'د', 'ا', 'ق', ' ', 'ئ', 'ە', 'ھ', 'ۋ', 'ا', 'ل', 'ى', 'ڭ', 'ى', 'ز', '؟', ' ', 'ئ', 'ا', 'ي', 'د', 'ى', 'ڭ', 'ل', 'ا', 'ش', 'ت', 'ۇ', 'ر', 'ا', 'ل', 'م', 'ا', 'ي', 'ۋ', 'ا', 'ت', 'ق', 'ا', 'ن', 'ل', 'ى', 'ق', 'ى', 'ڭ', 'ى', 'ز', 'د', 'ى', 'ن', 'م', 'ى', 'ك', 'ى', 'ن', 'ت', 'ا', 'ڭ', ' ', 'ئ', 'ا', 'ي', 'د', 'ى', 'ڭ', 'ل', 'ا', 'ش', 'ت', 'ۇ', 'ر', 'ا', 'ل', 'م', 'ا', 'ي', 'ۋ', 'ا', 'ت', 'ق', 'ا', 'ن', 'د'