In [None]:
%%capture
# export
from typing import *
from fastai2.basics import *

from transformers import GPT2Tokenizer, PreTrainedTokenizer, AutoTokenizer

In [None]:
# default_exp tokenizers

# Tokenizers
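> Tokenizer helpers built on Hugging Face `transformers`: a batch wrapper usable with fastai2's `parallel_gen`, and a GPT-2 tokenizer that adds begin/end tokens for decoder inputs.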

## TransformersTokenizer

In [None]:
#export
class TransformersTokenizer():
    """Batch adapter around a `transformers` tokenizer.

    fastai expects a tokenizer that can handle a list of strings (this is how
    `parallel_gen()` uses it); the wrapped tokenizer's `tokenize` is applied
    to each string of the list in turn.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer):
        # The wrapped tokenizer; only its `tokenize` method is used here.
        self.tokenizer = tokenizer

    def __call__(self, items: List[str]):
        "Return a lazy iterator with the tokenization of each string in `items`, in order."
        tokenize_one = self.tokenizer.tokenize
        return map(tokenize_one, items)

In [None]:
# Tokenize a small batch directly with a pretrained BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
texts = ['This is a test', 'Just test']
expected_tokens = [['this', 'is', 'a', 'test'], ['just', 'test']]

transformers_tokenizer = TransformersTokenizer(tokenizer)
# Calling the wrapper gives a lazy iterator; materialize it before comparing.
tok_texts = list(transformers_tokenizer(texts))

test_eq(tok_texts, expected_tokens)

HBox(children=(IntProgress(value=0, description='Downloading', max=313, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [None]:
texts = ['This is a test', 'Just test']
# parallel_gen is handed the class plus the `tokenizer` keyword and yields
# (index, tokens) pairs such as (0, ['this', 'is', 'a', 'test']) and (1, ['just', 'test']).
indexed_tokens = L(parallel_gen(TransformersTokenizer, texts, tokenizer=tokenizer))
# Sort the pairs (index comes first), then keep only the tokens of each pair.
tok_texts = indexed_tokens.sorted().itemgot(1)

test_eq(tok_texts, [['this', 'is', 'a', 'test'], ['just', 'test']])

## GPT2DecoderTokenizer

In [None]:
# export
class GPT2DecoderTokenizer(GPT2Tokenizer):
    """GPT-2 tokenizer intended for the decoder side of machine translation.

    On construction it registers two special tokens: `<|bos|>` as the bos
    token and `<|pad|>` as the pad token. `_tokenize` then wraps the BPE
    tokens produced by the parent class with the bos token in front and the
    eos token (`<|endoftext|>`) at the end.
    """

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
        self.add_special_tokens({'bos_token': '<|bos|>', 'pad_token': '<|pad|>'})

    def _tokenize(self, text, add_prefix_space=False):
        "Tokenize `text` with the parent class, then surround the result with bos/eos tokens."
        inner_tokens = super()._tokenize(text, add_prefix_space=add_prefix_space)
        return [self.bos_token, *inner_tokens, self.eos_token]

In [None]:
# Build the decoder tokenizer from the pretrained 'distilgpt2' checkpoint.
# Note: this rebinds `tokenizer`, which the earlier cells bound to the BERT tokenizer.
tokenizer = GPT2DecoderTokenizer.from_pretrained('distilgpt2')

In [None]:
sentence = 'The dog.'
expected_tokens = ['<|bos|>', 'The', 'Ġdog', '.', '<|endoftext|>']
expected_ids = [50257, 464, 3290, 13, 50256]

# The sentence is wrapped with the bos token in front and the eos token at the end.
test_eq(tokenizer.tokenize(sentence), expected_tokens)
test_eq(tokenizer.encode(sentence), expected_ids)
# Padding to length 6 appends one extra id (50258, presumably the id of '<|pad|>').
test_eq(tokenizer.encode(sentence, max_length=6, pad_to_max_length=True), expected_ids + [50258])

## Export -

In [None]:
# hide
# Run nbdev's notebook-to-script conversion; the output below lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
