In [4]:
# coded by Chris Ha (hac541309@gmail.com) of EleutherAI, DuckAI
# for polyglot(EleutherAI) and polylingual(DuckAI) projects
# Licensed as MIT or Apache 2.0 or later versions of these licenses

In [5]:
import tokenizers
import json
import os

In [6]:
# reconstitute into dicts
# this might be just done with dict(tokenizer_json['model']['vocab'])
def recover_vocab_dict(tokenizer_json: dict) -> dict:
    """Rebuild a token -> score mapping from a loaded tokenizer JSON.

    Reads ``tokenizer_json['model']['vocab']`` and, for every entry, uses
    element 0 as the key and element 1 as the value. When a key occurs more
    than once, the value of its last occurrence is kept.
    """
    vocab_entries = tokenizer_json['model']['vocab']
    return {entry[0]: entry[1] for entry in vocab_entries}

def check_duplicates(lst):
    """Report which first elements occur more than once in ``lst``.

    Every item of ``lst`` is indexed with ``[0]``. The result maps each
    first element that appears at least twice to the number of times it
    appears; it is empty when nothing repeats.
    """
    occurrences = {}
    for entry in lst:
        head = entry[0]
        occurrences[head] = occurrences.get(head, 0) + 1
    return {head: seen for head, seen in occurrences.items() if seen > 1}

In [7]:
# Documented options for reconciling the score of a token found in both vocabs:
#   "equal" : average the two scores
#   "min"   : keep the lower (unlikeliest) of the two
#   "max"   : keep the higher (likeliest) of the two
nlp_mode = "max"

# Small offset used later to set the byte-level tokens apart: they are given
# a score just below every other token, so any other character or token is
# ranked above them.
epsilon = 1e-7

# Whether to bring in the added_tokens of the additional vocab.
# NOTE(review): not referenced by the cells visible here - confirm it is used.
with_added_tokens = False

# ByteLevel works at the byte level: each byte value is encoded as a unique
# visible character, which gives an alphabet of 256 different characters.
byte_level_alphabet = list(tokenizers.pre_tokenizers.ByteLevel.alphabet())
byte_level_alphabet.sort()
assert len(byte_level_alphabet) == 256

# Desired size of the final tokenizer; leave as None to skip trimming.
target_vocab_size = 65536
# Number of model-vocab entries to keep so that the final size is reached.
# It is derived from target_vocab_size in a later cell.
target_intermediate_size = None

In [58]:
# Only the vocab is merged. Everything else (special tokens, added tokens,
# preprocessing, ...) is inherited from the base tokenizer.
base_tokenizer_path = "/home/karyo/corpus/models/merging/bulm_kr_65k.json"
add_tokenizer_path = "/home/karyo/corpus/models/merging/bulm_en_32k.json"
new_tokenizer_path = "/home/karyo/corpus/models/merging/final.json"

# Both input files have to exist before anything is loaded.
assert os.path.isfile(base_tokenizer_path)
assert os.path.isfile(add_tokenizer_path)

base_tokenizer = tokenizers.Tokenizer.from_file(base_tokenizer_path)

In [10]:
# Vocab sizes as reported by the tokenizer, with and without added tokens.
base_total_size = base_tokenizer.get_vocab_size(with_added_tokens=True)
base_unadded_size = base_tokenizer.get_vocab_size(with_added_tokens=False)
print(f"Total Size = {base_total_size}, Unadded size = {base_unadded_size}")

# The materialised vocab can differ from the reported size (e.g. <bos>/<eos>),
# so measure it separately.
base_internal_total = len(base_tokenizer.get_vocab(with_added_tokens=True))
base_internal_unadded = len(base_tokenizer.get_vocab(with_added_tokens=False))
print(f"Internal Total Size = {base_internal_total}, Unadded size = {base_internal_unadded}")

# added_offset    : how many entries are added tokens
# internal_offset : gap between the reported size and the materialised vocab
added_offset = base_total_size - base_unadded_size
internal_offset = base_total_size - base_internal_total
assert added_offset == (base_internal_total - base_internal_unadded)

# The model vocab has to be smaller than the target by both offsets; the
# difference comes from how huggingface tokenizers treat added and special
# tokens.
if target_vocab_size:
    target_intermediate_size = target_vocab_size - internal_offset - added_offset
    print(target_intermediate_size)

Total Size = 65536, Unadded size = 65513
Internal Total Size = 65536, Unadded size = 65513
65513


In [11]:
# Load both tokenizer definitions as plain JSON so their vocab can be edited.
# The encoding is spelled out because the vocab contains non-ASCII tokens and
# the platform's default text encoding is not guaranteed to be UTF-8.
with open(base_tokenizer_path, encoding="utf-8") as base_file, \
        open(add_tokenizer_path, encoding="utf-8") as add_file:
    base_tokenizer_json = json.load(base_file)
    add_tokenizer_json = json.load(add_file)

# this only works for unigram
assert base_tokenizer_json['model']['type'].lower() == "unigram"
assert add_tokenizer_json['model']['type'].lower() == "unigram"

In [12]:
# Build a token -> score dict for each tokenizer straight from its vocab list.
# The two commented-out calls are the hand-written alternative using
# recover_vocab_dict.
# base_tokenizer_dict = recover_vocab_dict(base_tokenizer_json)
# add_tokenizer_dict = recover_vocab_dict(add_tokenizer_json)
base_tokenizer_dict = dict(base_tokenizer_json['model']['vocab'])
add_tokenizer_dict = dict(add_tokenizer_json['model']['vocab'])
# Show both vocab sizes before merging.
print(len(base_tokenizer_dict), len(add_tokenizer_dict))

65513 32745


In [13]:
# Merge the additional vocab into the base vocab (base_tokenizer_dict is
# updated in place).
#
# A token present in both vocabs is combined according to nlp_mode:
#   "max"   : keep the higher (likeliest) score
#   "min"   : keep the lower (unlikeliest) score
#   "equal" : average the two scores
# "max" is the configured choice: averaging would lower a shared token's score
# relative to the other tokens, and there is no reason to penalise a token
# just because both vocabs contain it.
# In every mode the combined score is bumped up by epsilon and capped at 0.
# A token found only in the additional vocab is copied over unchanged.
#
# NOTE: because the dict is updated in place, re-running this cell without
# rebuilding the two dicts first applies the epsilon bump a second time.
_score_combiners = {
    "max": max,
    "min": min,
    "equal": lambda base_score, add_score: (base_score + add_score) / 2,
}
if nlp_mode not in _score_combiners:
    raise ValueError(
        f"unknown nlp_mode {nlp_mode!r}; expected one of {sorted(_score_combiners)}"
    )
_combine_scores = _score_combiners[nlp_mode]

for token, add_score in add_tokenizer_dict.items():
    if token in base_tokenizer_dict:
        base_tokenizer_dict[token] = min(
            0, _combine_scores(base_tokenizer_dict[token], add_score) + epsilon
        )
    else:
        base_tokenizer_dict[token] = add_score

In [47]:
def sort_key(x):
    """Sort key for a (token, score) pair: highest score first, then token."""
    return [-x[1], x[0]]


# huggingface tokenizers match added/special tokens directly with their ids,
# so reordering the vocab like this can cause a lot of pain.
# Adding sorted(special_tokens) before using this tool helps; the alternative
# is to realign the ids manually.
sorted_base = list(base_tokenizer_dict.items())
sorted_base.sort(key=sort_key)
print(len(sorted_base))

82051


In [15]:
# single_tokens = list(filter(lambda token:len(token[0])==1,sorted_base))
# assert len(single_tokens) == 256

In [48]:
# Rebuild the vocab as mutable [token, score] lists, keeping only the first
# target_intermediate_size entries. When that size is None the slice keeps
# everything.
merged_vocab = [list(entry) for entry in sorted_base[:target_intermediate_size]]

# Score for the byte tokens: just below the lowest score that survived the
# trim, which makes them the unlikeliest entries.
min_log_prob = min(entry[1] for entry in merged_vocab) - epsilon
print(len(merged_vocab), min_log_prob)

65513 -14.957779311964392


In [None]:
# uncomment to forcibly remove bytelevel token items

# merged_vocab = [item for item in merged_vocab if item[0] not in byte_level_alphabet]
# for item in merged_vocab:
#    if item[0] in byte_level_alphabet:
#        print(item)

In [49]:
# Make sure every byte-level token is in the merged vocab with the lowest
# score, without changing the vocab size:
# 1. a byte token that is already present has its score set to min_log_prob
# 2. a byte token that is missing (e.g. it was trimmed away) overwrites the
#    unlikeliest entry that is not itself a byte token

# Set for O(1) membership tests; the list keeps the iteration order.
byte_token_set = set(byte_level_alphabet)

# token -> first entry carrying it, so each byte is found without scanning
# the whole vocab again.
entry_by_token = {}
for entry in merged_vocab:
    entry_by_token.setdefault(entry[0], entry)

not_included_count = 0
for byte in byte_level_alphabet:
    existing_entry = entry_by_token.get(byte)
    if existing_entry is not None:
        existing_entry[1] = min_log_prob
        continue

    # The byte token is not in the vocab: put it in place of an unlikely
    # token that is not a byte token. merged_vocab is still ordered likeliest
    # first here, so walking it backwards visits the unlikeliest entries first.
    not_included_count += 1
    for entry in reversed(merged_vocab):
        if entry[0] in byte_token_set:
            continue
        print(f"vocab item {entry[0]}, replaced with {byte}")
        entry[0] = byte
        entry[1] = min_log_prob
        entry_by_token[byte] = entry
        break
    else:
        # Without a free slot the byte token would be dropped silently.
        raise ValueError(
            f"no non-byte vocab entry left to replace with byte token {byte!r}"
        )

merged_vocab = sorted(merged_vocab, key=sort_key)
assert len(merged_vocab)
print(len(merged_vocab), not_included_count)

vocab item cynical, replaced with À
vocab item traditions, replaced with Á
vocab item íĻįìĬ¹, replaced with È
vocab item unden, replaced with Ô
vocab item pearl, replaced with Ý
vocab item ìıĺìķĦ, replaced with ß
vocab item spawned, replaced with ä
vocab item ê·¸ìĥĪ, replaced with ñ
vocab item Ġfirearm, replaced with ò
vocab item ìľłëĿ¼ìĭľìķ, replaced with ó
vocab item íİľëįĶ, replaced with ô
vocab item ë£¨íĨł, replaced with õ
vocab item Ġtyrann, replaced with ö
vocab item íĪ¬ë¥´íģ¬, replaced with ÷
vocab item ì½Ķë¸ĮëĿ¼, replaced with ø
vocab item Ġtanky, replaced with ù
vocab item hesitan, replaced with ú
vocab item TDC, replaced with û
vocab item ãħĨãĦ±ãĦ´, replaced with ü
vocab item ĠACADEMY, replaced with ý
vocab item welve, replaced with þ
vocab item Ġhips, replaced with ÿ
vocab item ange, replaced with ā
vocab item tua, replaced with Ă
vocab item hinge, replaced with Ą
vocab item ëłĪíĥĢ, replaced with ą
vocab item ë¯¸ëĭĪìĸ¸, replaced with Ć
vocab item íĥĢìĤ°, replaced with ċ
voca

In [50]:
# Print the length of the base model vocab next to the merged vocab's length
# before the merged vocab is swapped in.
print(len(base_tokenizer_json['model']['vocab']), len(merged_vocab))

65513 65513


In [51]:
# Replace the base model's vocab with the merged one; every other part of
# base_tokenizer_json stays as it was loaded.
base_tokenizer_json['model']['vocab'] = merged_vocab


['Ń', -14.957779311964392]

In [59]:
# Write the merged tokenizer definition to new_tokenizer_path.
# ensure_ascii=False emits the non-ASCII tokens as raw characters, so the file
# is opened as UTF-8 explicitly rather than with the platform default encoding.
with open(new_tokenizer_path, "w", encoding="utf-8") as new_file:
    json.dump(base_tokenizer_json, new_file, indent=2, ensure_ascii=False)