In [1]:
# coded by Chris Ha (hac541309@gmail.com) of EleutherAI, DuckAI
# for polyglot(EleutherAI) and polylingual(DuckAI) projects
# Licensed as MIT or Apache 2.0 or later versions of these licenses

In [2]:
import tokenizers
import json
import os

In [3]:
# nlp_mode names the policy for choosing between two candidate scores:
#   "equal" -> average the two
#   "min"   -> keep the lowest (least likely) one
#   "max"   -> keep the highest (most likely) one
nlp_mode = "max"

# Step used later to separate the byte-level tokens from everything else:
# their score is set to the minimum log prob minus one epsilon for every
# character beyond the first, so they end up at the bottom of the vocab and
# other characters or tokens are preferred over them.
epsilon = 1e-7


def sort_key(vocab_logprob_pair):
    """Sort key for a (token, log_prob) pair.

    Returns ``[-log_prob, token]``: an ascending sort therefore lists the
    highest log prob first and breaks ties by the token text.
    """
    negated_log_prob = -vocab_logprob_pair[1]
    token_text = vocab_logprob_pair[0]
    return [negated_log_prob, token_text]


# When not None, the vocabulary is trimmed to this many entries.
target_vocab_size = 256000
# Size the model's own vocab must have so the final tokenizer reaches the
# intended size; unset here and computed once the base tokenizer is loaded.
target_intermediate_size = None

In [4]:
# This section initializes byte vocabulary and unicode exemplars
# Korean is handled differently from the other exemplars due to quantity

# Since the ByteLevel works as its name suggests, at the byte level, it encodes each byte value to a unique visible character.
# This means that there is a total of 256 different characters composing this alphabet.
byte_level = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space = False)
# already requested in the constructor; set again explicitly as a safeguard
byte_level.add_prefix_space = False
byte_level_alphabet = sorted(tokenizers.pre_tokenizers.ByteLevel.alphabet())
assert len(byte_level_alphabet) == 256
whitespaces = ["  ","    ","        ","                "]


def _to_byte_level(text):
    """Return the byte-level form of `text` as produced by `byte_level`.

    Only the first piece returned by the pre-tokenizer is used.
    NOTE(review): assumes every exemplar pre-tokenizes into exactly one
    piece; an empty exemplar would raise IndexError here.
    """
    return byte_level.pre_tokenize_str(text)[0][0]


# The exemplar files hold arbitrary Unicode characters, so they are decoded
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open("/home/karyo/corpus/data/unicode/non_korean_exemplars.json", encoding="utf-8") as f:
    non_korean_exemplars = json.load(f)
# runs of spaces are treated as exemplars too
non_korean_exemplars += whitespaces
non_korean_exemplar_byte = [_to_byte_level(exemplar) for exemplar in non_korean_exemplars]

with open("/home/karyo/corpus/data/unicode/ksx-1001.txt", encoding="utf-8") as f:
    # whitespace-separated list of exemplars
    korean_exemplars = f.read().split()
korean_exemplar_byte = [_to_byte_level(exemplar) for exemplar in korean_exemplars]

# byte-level spellings: the 256-symbol alphabet plus every exemplar
essential_byte_tokens = set(byte_level_alphabet + non_korean_exemplar_byte + korean_exemplar_byte)
# the same exemplars in their original (non byte-level) form
essential_tokens = set(non_korean_exemplars + korean_exemplars)

In [5]:
# Source tokenizer; it must already exist on disk.
base_tokenizer_path = "/home/karyo/corpus/models/hplt/rc/base_v2.json"
assert os.path.isfile(base_tokenizer_path)
# Destination for the tokenizer with the exemplars injected.
new_tokenizer_path = "/home/karyo/corpus/models/hplt/rc/v2_exemplars.json"
base_tokenizer = tokenizers.Tokenizer.from_file(base_tokenizer_path)
# List copy of the byte-level essentials, iterated when they are injected
# into the vocab and consulted when trimming.
essential_plus = [*essential_byte_tokens]

In [6]:
# Vocabulary size with and without the tokens registered as "added".
base_total_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
base_unadded_size = base_tokenizer.get_vocab_size(with_added_tokens=False)
# The reported sizes must agree with the actual vocab mappings.
assert base_total_size == len(base_tokenizer.get_vocab(with_added_tokens=True))
assert base_unadded_size == len(base_tokenizer.get_vocab(with_added_tokens=False))
# special tokens might not be considered added
print(f"Total Size = {base_total_size}, Unadded size = {base_unadded_size}")
# Number of entries that exist only as added tokens.
added_offset = base_total_size - base_unadded_size
if target_vocab_size:
    # Leave room for the added tokens inside the overall target.
    target_intermediate_size = target_vocab_size - added_offset
    print(target_intermediate_size)

Total Size = 270177, Unadded size = 270177
256000


In [7]:
# Load the raw tokenizer JSON so the model's vocab can be edited directly.
# The vocab contains non-ASCII tokens, so the file is decoded explicitly as
# UTF-8 rather than with the platform's default encoding.
with open(base_tokenizer_path, encoding="utf-8") as base_file:
    base_tokenizer_json = json.load(base_file)
# The following steps treat model.vocab as (token, score) pairs, which is
# only checked here for the unigram model type.
assert base_tokenizer_json['model']['type'].lower() == "unigram"

In [8]:
# token -> score lookup built from the model's (token, score) vocab entries
vocab_entries = base_tokenizer_json['model']['vocab']
base_tokenizer_dict = dict(vocab_entries)
# all pairs, in descending order under sort_key
sorted_base = sorted(base_tokenizer_dict.items(), key=sort_key, reverse=True)
# lowest score present in the base vocab
min_log_prob = min(score for _token, score in sorted_base)

In [9]:
# artificially add all essential tokens
# Essentials missing from the base vocab enter with the lowest score seen.
missing_essentials = [
    (token, min_log_prob)
    for token in essential_plus
    if token not in base_tokenizer_dict
]
# The list is rebuilt from base_tokenizer_dict instead of being appended to,
# so running this cell again does not add the essential tokens a second time.
sorted_base = sorted(
    [*base_tokenizer_dict.items(), *missing_essentials],
    key=sort_key,
)
len(sorted_base)

276493

In [10]:
# remove until size matches target
# do not remove essential tokens
#
# The model vocab is trimmed to target_intermediate_size (the overall target
# minus the added tokens). Nothing is trimmed when no target is configured.
if target_intermediate_size is not None:
    # set membership keeps the protection check O(1) per token
    protected_tokens = set(essential_plus)
    surplus = len(sorted_base) - target_intermediate_size
    survivors = []
    # Walk from the least likely entry to the most likely one so the tokens
    # dropped are the lowest-scored ones. A new list is built rather than
    # removing from the list being iterated, which would skip entries.
    for pair in sorted(sorted_base, key=sort_key, reverse=True):
        token = pair[0]
        removable = token not in protected_tokens and not token.startswith("<|")
        if surplus > 0 and removable:
            surplus -= 1
        else:
            survivors.append(pair)
    # If too few removable tokens exist, the vocab stays above the target.
    sorted_base = survivors

# sort vocab
sorted_base.sort(key=sort_key)
len(sorted_base)

256000

In [11]:
# set the foundational tokens to have smallest log probs
# Essential byte-level tokens get the minimum log prob, lowered by one
# epsilon for every character beyond the first; all other tokens keep the
# score they already have.
new_tokenizer_list = [
    (
        token,
        min_log_prob - epsilon * (len(token) - 1)
        if token in essential_byte_tokens
        else score,
    )
    for token, score in sorted_base
]
len(new_tokenizer_list)

256000

In [12]:
# Final ordering: highest score first, ties broken by token text. The dict
# keeps that insertion order.
ranked_pairs = sorted(new_tokenizer_list, key=lambda entry: (-entry[1], entry[0]))
new_tokenizer_dict = dict(ranked_pairs)

In [13]:
# Put the rescored vocab back into the tokenizer JSON as a list of
# (token, score) entries, in the dict's order.
model_section = base_tokenizer_json['model']
model_section['vocab'] = [*new_tokenizer_dict.items()]

In [14]:
# write new tokenizer
# ensure_ascii=False writes non-ASCII tokens verbatim, so the output encoding
# is pinned to UTF-8 instead of depending on the platform default.
with open(new_tokenizer_path, "w", encoding="utf-8") as new_file:
    json.dump(base_tokenizer_json, new_file,indent=2,ensure_ascii=False)

# check if new tokenizer can be loaded
new_tokenizer = tokenizers.Tokenizer.from_file(new_tokenizer_path)
if target_vocab_size:
    # the reloaded tokenizer, added tokens included, must hit the target size
    assert target_vocab_size == new_tokenizer.get_vocab_size(True)