## Train a Korean Tokenizer and Merge It into the Base Tokenizer

In [1]:
from datasets import load_dataset
import json
from transformers import AutoTokenizer

In [2]:
# Load the corpus from a local dataset directory; the second positional
# argument ("python") is passed as the dataset configuration name.
# NOTE(review): confirm "python" is the intended configuration for this directory.
raw_datasets = load_dataset("../../datasets/privateLLM/test_data/", "python")

In [3]:
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 158470
    })
})

In [4]:
# Flatten every field of every test example into a {'text': ...} record,
# dropping only 'input' fields whose value is the empty string.
sources = [
    {'text': value}
    for record in raw_datasets["test"]
    for field, value in record.items()
    if not (field == 'input' and value == "")
]

In [5]:
# Write the records as JSON Lines: one JSON object per line, which is what the
# '.jsonl' extension promises. Dumping the whole list at once produced a single
# JSON array that line-oriented JSONL readers cannot parse, and it also built
# the entire serialized corpus in memory before writing.
# ensure_ascii=False keeps non-ASCII text as-is instead of \uXXXX escapes.
with open('../../datasets/privateLLM/test_data.jsonl', 'w', encoding='UTF-8') as f:
    for record in sources:
        f.write(json.dumps(record, ensure_ascii=False))
        f.write('\n')

## Preprocess the dataset into batches of text for tokenizer training



In [6]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the corpus as successive lists of text strings.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"text"`` entry. Defaults to
            ``raw_datasets["test"]`` (the previous hard-coded behaviour).
        batch_size: Number of rows per yielded batch. Defaults to 1000, the
            previously hard-coded value.

    Yields:
        The ``"text"`` entry of each consecutive ``batch_size``-row slice.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    # A negative step would silently yield nothing, so reject it explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if dataset is None:
        dataset = raw_datasets["test"]
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size]["text"]


# A generator can be consumed only once; call get_training_corpus() again
# if the corpus has to be iterated a second time.
training_corpus = get_training_corpus()

In [7]:
training_corpus

<generator object get_training_corpus at 0x0000027546379E00>

In [8]:
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 158470
    })
})

In [9]:
# Load the base tokenizer (the one that is extended later) from a local directory.
# NOTE(review): trust_remote_code=True permits running Python code shipped with the
# checkpoint; confirm this directory actually needs it, otherwise drop the flag.
old_tokenizer = AutoTokenizer.from_pretrained('./tokenizertest/mistral_tokenizer', trust_remote_code=True)

In [10]:
# Baseline: tokenize a Korean sample sentence with the unmodified base tokenizer.
example = "마파두부의 매력은 어디까지인가?"

tokens = old_tokenizer.tokenize(example)
tokens

['▁',
 '마',
 '파',
 '두',
 '부',
 '의',
 '▁',
 '매',
 '력',
 '은',
 '▁',
 '어',
 '디',
 '까',
 '지',
 '인',
 '가',
 '?']

In [11]:
# train_new_from_iterator (used next) is provided by fast tokenizers, so check first.
old_tokenizer.is_fast

True

In [12]:
# Train a new tokenizer of the same type as old_tokenizer on the training corpus.
# The second argument (32000) is the target vocabulary size.
# This call consumes the training_corpus generator.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 32000)

In [13]:
len(tokenizer)

32000

In [14]:
len(old_tokenizer)

32000

In [16]:
# Token-string -> id mapping of the newly trained tokenizer.
vocab = tokenizer.get_vocab()

In [17]:
# Add the new tokenizer's tokens to the base tokenizer in ascending id order.
# get_vocab() gives no ordering guarantee and add_tokens assigns new ids in the
# order it receives the tokens, so sorting by id keeps the merged ids
# reproducible across runs.
# NOTE: this mutates old_tokenizer in place; reload it before re-running this cell.
vocab_list = sorted(vocab, key=vocab.get)
old_tokenizer.add_tokens(vocab_list)

32000

In [18]:
len(old_tokenizer)

60378

In [19]:
# Tokenize the same sample sentence with the newly trained tokenizer.
tokens = tokenizer.tokenize(example)
tokens

['▁마', '파', '두', '부의▁', '매', '력은▁', '어디', '까지', '인가', '?']

In [20]:
# Tokenize the sample again with old_tokenizer, which now carries the added tokens.
tokens = old_tokenizer.tokenize(example)
tokens

['▁마', '파', '두', '부', '의', '▁매력', '은', '▁어디', '까', '지', '인', '가', '?']

In [21]:
# Persist the merged tokenizer (old_tokenizer after add_tokens) to disk.
old_tokenizer.save_pretrained("./test_tokenizer3")

('./test_tokenizer3\\tokenizer_config.json',
 './test_tokenizer3\\special_tokens_map.json',
 './test_tokenizer3\\tokenizer.model',
 './test_tokenizer3\\added_tokens.json',
 './test_tokenizer3\\tokenizer.json')

### Test Old & New Tokenizer

In [5]:
# Toy source/target strings; each example is a source with its target appended.
sources = ['1', '2', '3', '4', '5']
targets = ['a', 'b', 'c', 'd', 'e']
examples = list(map("".join, zip(sources, targets)))
examples

['1a', '2b', '3c', '4d', '5e']

In [9]:
# Stand-in for real tokenization: bind both names to the untouched string lists.
examples_tokenized, sources_tokenized = examples, sources
sources_tokenized
# Label-masking sketch kept for reference (not executed):
# input_ids = examples_tokenized["input_ids"]
# labels = copy.deepcopy(input_ids)
# for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
#     label[:source_len] = IGNORE_INDEX
# return dict(input_ids=input_ids, labels=labels)

['1', '2', '3', '4', '5']

In [10]:
import operator

# operator.ne(a, b) is `a != b`. Here a list is compared with a str, so the
# result is True regardless of the list's contents.
# NOTE(review): if an element-wise comparison was intended, this does not do it.
operator.ne(sources, '1')

True

In [3]:
# Model identifier whose tokenizer is loaded below.
MODEL = 'beomi/KoAlpaca-Polyglot-5.8B'

# use_fast=False requests the slow (Python) tokenizer implementation.
tokenizer_koalpaca = AutoTokenizer.from_pretrained(MODEL, use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
len(tokenizer_koalpaca)

30003