## tokenizer

In [1]:
from itertools import islice

from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

In [2]:
# Open the train split of the local ./codeparrot/ dataset in streaming mode.
dataset = load_dataset(
    './codeparrot/',
    split='train',
    streaming=True,
)

Resolving data files:   0%|          | 0/184 [00:00<?, ?it/s]

In [3]:
# One shared iterator over the streamed dataset. Every later next() call
# advances it, so each training run continues where the previous one stopped
# rather than starting again from the first example.
iter_dataset = iter(dataset)

In [4]:
# Load the pretrained GPT-2 tokenizer; it is the starting point from which
# the new tokenizer is trained further down.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

### build

In [5]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# Forward table returned by bytes_to_unicode(): presumably byte value ->
# printable unicode character, going by the helper's name.
bytes_to_unicode_map = bytes_to_unicode()
# Inverse table: swap every (key, value) pair of the forward table.
unicode_to_bytes_map = {char: byte for byte, char in bytes_to_unicode_map.items()}
# The keys of the inverse table are used as the initial alphabet for training.
base_vocab = list(unicode_to_bytes_map)

In [6]:
# Number of examples drawn from the stream for one training run. The count is
# fixed explicitly rather than derived from the dataset.
length = 100000


def batch_iterator(batch_size=1000, total=None, source=None):
    """Yield batches of ``'content'`` values for tokenizer training.

    Args:
        batch_size: Maximum number of examples per yielded list.
        total: Overall number of examples to draw. Defaults to the
            module-level ``length``, looked up when iteration starts.
        source: Iterator of records that each have a ``'content'`` field.
            Defaults to the shared module-level ``iter_dataset``, which is
            advanced in place (later calls continue where this one stopped).

    Yields:
        Lists of ``'content'`` values, each at most ``batch_size`` long. If
        ``source`` runs dry early, the last partial batch is yielded and
        iteration ends.
    """
    if total is None:
        total = length
    if source is None:
        source = iter_dataset
    for start in tqdm(range(0, total, batch_size)):
        # Cap the final batch so that no more than `total` examples are consumed.
        wanted = min(batch_size, total - start)
        # islice simply stops at the end of the stream. A bare next() here
        # would raise StopIteration inside the generator, which Python turns
        # into a RuntimeError (PEP 479) and which would discard the partial batch.
        batch = [example['content'] for example in islice(source, wanted)]
        if not batch:
            return
        yield batch

In [7]:
# Fairly time-consuming.
new_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=12500,
    initial_alphabet=base_vocab,
)

100%|██████████| 100/100 [00:24<00:00,  4.10it/s]







In [8]:
new_tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=12500, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

### new tokenizer

In [11]:
# Vocabulary entries as (token, id) pairs, ordered by ascending id.
tokens = sorted(new_tokenizer.vocab.items(), key=lambda entry: entry[1])

In [17]:
# Peek at the first ten symbols of the initial alphabet.
base_vocab[:10]

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']

In [30]:
# Entries 257..279 of the id-sorted vocabulary, each paired with its decoded
# text. These look like the first learned merges; presumably the 256 byte
# symbols plus one special token occupy the ids before them (TODO confirm).
[
    (token, new_tokenizer.convert_tokens_to_string([token]))
    for token, _token_id in tokens[257:280]
]

[('ĠĠ', '  '),
 ('ĠĠĠĠ', '    '),
 ('ĠĠĠ', '   '),
 ('ĠĠĠĠĠĠĠĠ', '        '),
 ('se', 'se'),
 ('in', 'in'),
 ('ĠĠĠĠĠĠĠ', '       '),
 ('re', 're'),
 ('on', 'on'),
 ('te', 'te'),
 ('ĊĠĠĠĠĠĠĠ', '\n       '),
 ('ĊĠĠĠĠĠĠĠĠ', '\n        '),
 ('or', 'or'),
 ('st', 'st'),
 ('de', 'de'),
 ('ĊĠĠĠ', '\n   '),
 ('th', 'th'),
 ('le', 'le'),
 ('Ġ=', ' ='),
 ('lf', 'lf'),
 ('self', 'self'),
 ('me', 'me'),
 ('al', 'al')]

In [33]:
# The 15 entries with the highest ids, each paired with its decoded text.
[
    (token, new_tokenizer.convert_tokens_to_string([token]))
    for token, _token_id in tokens[-15:]
]

[('ĠSetup', ' Setup'),
 ('publisher', 'publisher'),
 ('DER', 'DER'),
 ('Ġcapt', ' capt'),
 ('Ġembedded', ' embedded'),
 ('Ġregarding', ' regarding'),
 ('Bundle', 'Bundle'),
 ('355', '355'),
 ('Ġrecv', ' recv'),
 ('Ġdmp', ' dmp'),
 ('Ġvault', ' vault'),
 ('ĠMongo', ' Mongo'),
 ('Ġpossibly', ' possibly'),
 ('implementation', 'implementation'),
 ('Matches', 'Matches')]

### python keyword

In [34]:
# Python standard library
import keyword

In [35]:
# How many keywords the running Python version defines.
len(keyword.kwlist)

35

In [39]:
# Report every Python keyword that is not a single token in the vocabulary.
# The vocabulary is fetched once up front: on a fast tokenizer `.vocab`
# returns a freshly built token -> id dict on each access, so looking it up
# inside the loop would rebuild it once per keyword.
# NOTE: only the bare spelling is tested; a space-prefixed variant
# (e.g. 'Ġawait') does not count as a hit here.
new_vocab = new_tokenizer.vocab
for kw in keyword.kwlist:
    if kw not in new_vocab:
        print(f'`{kw}` not in the new tokenizer')

`await` not in the new tokenizer
`finally` not in the new tokenizer
`nonlocal` not in the new tokenizer


### retrain

In [40]:
# Retrain on twice as many examples with a larger vocabulary.
# `batch_iterator` from the earlier cell is reused as-is: it reads the global
# `length` when it runs, so updating `length` is enough and the function does
# not need to be defined a second time.
# The examples come from the shared `iter_dataset`, i.e. they are the ones
# that follow whatever earlier runs have already consumed.
length = 100000*2
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(),
                                                  vocab_size=32768,
                                                  initial_alphabet=base_vocab)

100%|██████████| 200/200 [00:48<00:00,  4.15it/s]







In [41]:
# Repeat the keyword coverage check against the retrained tokenizer.
# The vocabulary is fetched once up front: on a fast tokenizer `.vocab`
# returns a freshly built token -> id dict on each access, so looking it up
# inside the loop would rebuild it once per keyword.
# NOTE: only the bare spelling is tested; a space-prefixed variant
# (e.g. 'Ġnonlocal') does not count as a hit here.
retrained_vocab = new_tokenizer.vocab
for kw in keyword.kwlist:
    if kw not in retrained_vocab:
        print(f'`{kw}` not in the new tokenizer')

`nonlocal` not in the new tokenizer


### to the hub

In [45]:
# One-off creation of the target repository, left commented out, presumably
# so that re-running the notebook does not try to create it a second time.
# from huggingface_hub import create_repo
# create_repo('codeparrot')

RepoUrl('https://huggingface.co/lanchunhui/codeparrot', endpoint='https://huggingface.co', repo_type='model', repo_id='lanchunhui/codeparrot')

In [51]:
import os
# Point both proxy variables at the proxy listening on localhost:7890.
# This address is machine-specific; adjust or remove it on other setups.
os.environ.update({
    'HTTP_PROXY': 'http://127.0.0.1:7890',
    'HTTPS_PROXY': 'http://127.0.0.1:7890',
})

In [52]:
# Upload the tokenizer to the Hub repository `<org>/<ckpt>`.
# The namespace is passed as part of the repo id: the separate `organization`
# keyword of `push_to_hub` is deprecated in Transformers.
ckpt = 'codeparrot'
org = 'lanchunhui'
new_tokenizer.push_to_hub(f'{org}/{ckpt}')

CommitInfo(commit_url='https://huggingface.co/lanchunhui/codeparrot/commit/212e703fb744884f9563cfbf6de94ffd5792606a', commit_message='Upload tokenizer', commit_description='', oid='212e703fb744884f9563cfbf6de94ffd5792606a', pr_url=None, pr_revision=None, pr_num=None)