In [29]:
import json
import random
import re
from copy import deepcopy

from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Load the trained auxiliary tokenizer definition from disk.
# The context manager closes the file handle as soon as parsing finishes, and
# the explicit UTF-8 encoding keeps the non-ASCII vocabulary readable even when
# the platform's default text encoding is something else.
with open("./assets/aux_tokenizer.json", "r", encoding="utf-8") as tokenizer_file:
    tokenizer_config = json.load(tokenizer_file)

In [3]:
# Select the trained tokens that are candidates for being added:
# tokens longer than one character whose FIRST character is a precomposed
# Hangul syllable (U+AC00..U+D7A3).
# NOTE(review): only the first character is tested, so tokens that continue
# with punctuation (e.g. '무엇인가요:' in the sample output) are kept too --
# confirm this is intended before tightening the pattern.
hangul_start = re.compile("[가-힣]")

vocabs = tokenizer_config["model"]["vocab"]

# Maps token text -> id in the trained vocabulary.
target_tokens = {
    token: ind
    for token, ind in vocabs.items()
    if len(token) > 1 and hangul_start.match(token)
}

print(f"total tokens trained: {len(vocabs)}")
print(f"tokens to add: {len(target_tokens)}")
# random.sample raises ValueError when k exceeds the population size, so clamp
# the sample size for small vocabularies.
sample_size = min(10, len(target_tokens))
print(f"\texamples: {random.sample(list(target_tokens.keys()), k=sample_size)}")


total tokens trained: 3878
tokens to add: 2201
	examples: ['국세', '전혀', '무엇인가요:', '시의', '디지', '할까요', '응답(추', '배트', '양육', '다윗']


In [30]:
# Drop candidate tokens that the base tokenizer can already produce.

model_id = "Qwen/Qwen2.5-7B"
base_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Index the base vocabulary by the text each id decodes to (not by the raw
# vocab key), so candidates are compared against decoded token text.
# If several ids decode to the same text, the last one seen wins.
base_vocab = {
    base_tokenizer.decode([token_id]): token_id
    for _, token_id in tqdm(base_tokenizer.vocab.items())
}

# A candidate is kept when its text is not among the decoded base entries.
filtered_tokens = [
    candidate
    for candidate, _ in tqdm(target_tokens.items())
    if base_vocab.get(candidate) is None
]

print(f"Tokens filtered: {len(filtered_tokens)}/{len(target_tokens)}")
# Deep copy, so new_tokenizer is a separate object from base_tokenizer.
new_tokenizer = deepcopy(base_tokenizer)


100%|██████████| 151665/151665 [00:00<00:00, 255553.39it/s]
100%|██████████| 2201/2201 [00:00<00:00, 2120271.73it/s]


Tokens filtered: 2109/2201


In [39]:
def _decode_pieces(tokenizer, text):
    """Encode ``text`` with ``tokenizer`` and decode each resulting id on its own."""
    return [tokenizer.decode([token_id]) for token_id in tokenizer.encode(text)]


# Report the tokenizer length before and after adding the filtered tokens.
print(f"Before: {len(base_tokenizer)}")
new_tokenizer.add_tokens([*filtered_tokens])
print(f"After: {len(new_tokenizer)}")

# Fixed Korean sentence used as the probe; a random draw of ten entries from
# filtered_tokens joined by spaces is an alternative probe.
samples = "한국어를 잘 학습했는가 확인해보겠습니다."

# Piece-by-piece tokenization of the same sentence: base first, extended second.
print(_decode_pieces(base_tokenizer, samples))
print(_decode_pieces(new_tokenizer, samples))

Before: 151665
After: 153774
['한', '국', '어', '를', ' 잘', ' �', '�', '습', '했', '는', '가', ' 확인', '해', '보', '겠습니다', '.']
['한국어', '를', ' 잘', ' ', '학습', '했', '는가', ' ', '확인', '해', '보', '겠습니다', '.']
