In [None]:
%%capture
#export
from fastcore.all import *
from fastai2.basics import *
from fastai2.text.all import *

from transformers import PreTrainedTokenizer, AutoTokenizer

In [None]:
# default_exp transforms

# Transforms
> Numericalize and Padding

## TransformersNumericalize

In [None]:
#export
class TransformersNumericalize(Transform):
    ''' Convert string tokens to vocabulary ids (and ids back to text) with a `transformers` tokenizer '''
    def __init__(self, tokenizer: PreTrainedTokenizer):
        self.tokenizer = tokenizer
    def encodes(self, o):
        ''' o: list of string token, returns: tensored list of int token.
        The ids are passed through the tokenizer's `build_inputs_with_special_tokens` before being tensored.
        '''
        ids = self.tokenizer.convert_tokens_to_ids(o)
        return TensorText(self.tokenizer.build_inputs_with_special_tokens(ids))
    def decodes(self, o):
        ''' o: iterable of int token (e.g. a rank one tensor), returns: decoded text as `TitledStr` '''
        # Iterating a tensor yields 0-dim tensors, not ints; hand the tokenizer plain
        # Python ints so the decode does not depend on it accepting tensor elements.
        return TitledStr(self.tokenizer.decode([int(i) for i in o]))

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# `encodes` adds the special tokens itself (via `build_inputs_with_special_tokens`),
# so it is fed bare tokens; including '[CLS]'/'[SEP]' here would duplicate their ids.
tok_list = ['this', 'is', 'a', 'test']
num_list = TensorText([ 101, 2023, 2003, 1037, 3231,  102])
transformersNumericalizer = TransformersNumericalize(tokenizer)

test_eq(transformersNumericalizer.encodes(tok_list), num_list)
test_eq(transformersNumericalizer.decodes(num_list), '[CLS] this is a test [SEP]')

## Pad2Max

In [None]:
#export
class Pad2Max(Transform):
    ''' Bring a rank one tensor to exactly `max_len` elements: shorter inputs are right-padded
    with `pad_idx`, longer inputs are truncated to their first `max_len` elements
    '''
    def __init__(self, max_len, pad_idx):
        self.max_len, self.pad_idx = max_len, pad_idx
    def encodes(self, o):
        ''' o: rank one tensor, returns: `o` padded or truncated to `max_len` '''
        n_missing = self.max_len - len(o)
        if n_missing < 0:
            # too long: keep only the leading max_len elements
            fixed = o[:self.max_len]
        elif n_missing > 0:
            # too short: append n_missing copies of pad_idx on the right
            fixed = nn.functional.pad(o, [0, n_missing], value=self.pad_idx)
        else:
            fixed = o
        assert len(fixed) == self.max_len, f'len(o): {len(o)}, max_len: {self.max_len}'
        return fixed
    def decodes(self, o):
        ''' Drop every element equal to `pad_idx` and return the rest as a `TensorText` '''
        kept = []
        for v in o:
            if v != self.pad_idx:
                kept.append(v)
        return TensorText(kept)

In [None]:
# One input checked against three target lengths:
# longer than the input (pad), shorter (truncate), and equal (unchanged).
num_list = torch.tensor([ 101, 2023, 2003, 1037, 3231,  102])
expected_by_max_len = {
    10: [ 101, 2023, 2003, 1037, 3231,  102,    1,    1,    1,    1],
    3:  [ 101, 2023, 2003],
    6:  [ 101, 2023, 2003, 1037, 3231,  102],
}

for max_len, expected in expected_by_max_len.items():
    pad2max = Pad2Max(max_len, 1)
    padded_num_list = torch.tensor(expected)
    test_eq(pad2max(num_list), padded_num_list)

## Export -

In [None]:
#hide
# Run nbdev's notebook2script; the recorded output below lists each notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_tokenizers.ipynb.
Converted 02_transforms.ipynb.
Converted 03_model_splits.ipynb.
Converted 04_callbacks.ipynb.
Converted 05_GeneratedLM.ipynb.
Converted 99a_example_roberta_classification.ipynb.
Converted 99b_example_gpt2_lm.ipynb.
Converted 99c_example_GeneratedLM.ipynb.
Converted index.ipynb.
