## Tokenizer

In [19]:
from transformers import AutoTokenizer
# ref: https://huggingface.co/docs/transformers/en/main_classes/tokenizer

In [20]:
# encode text
model_name = 'distilbert/distilroberta-base'
max_length = 32
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = ["Hello World", "My name is Tom", "Today is wednesday"]

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length)
encoding

{'input_ids': [[0, 31414, 623, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2387, 766, 16, 1560, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 5625, 16, 18862, 46836, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [24]:
# use different padding strategies
model_name = 'distilbert/distilroberta-base'
max_length = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)

def print_shape(encoding, label):
    shape = (len(encoding["input_ids"]), len(encoding["input_ids"][0]))
    print(f"[{label}] shape:", shape)

text = ["Hello World", "My name is Tom", "Today is wednesday"]

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length)
print_shape(encoding, "max_length")

encoding = tokenizer(text, padding="longest", truncation=True, max_length=max_length)
print_shape(encoding, "longest")

encoding = tokenizer(text, padding=False, truncation=True, max_length=max_length)

[max_length] shape: (3, 16)
[longest] shape: (3, 6)


In [33]:
# padding side: left or right
model_name = 'distilbert/distilroberta-base'
max_length = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = ["Hello World"]

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, padding_side='left')
print("left:", encoding['input_ids'])

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, padding_side='right')
print("right:", encoding['input_ids'])

left: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 31414, 623, 2]]
right: [[0, 31414, 623, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [53]:
# truncation
model_name = 'distilbert/distilroberta-base'
max_length = 8
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = ("This is an example of a text that is quite lengthy and goes beyond "
        "the maximum allowed input length for Qwen models. Since we can't "
        "feed this entire text into the model without truncating it, we apply "
        "truncation to ensure that the text fits within the model's maximum input "
        "length limit.")

encoding = tokenizer(text, truncation=False, max_length=max_length)
print(encoding["input_ids"])

encoding = tokenizer(text, truncation=True, max_length=max_length)
print(encoding["input_ids"])

[0, 713, 16, 41, 1246, 9, 10, 2788, 14, 16, 1341, 8787, 8, 1411, 1684, 5, 4532, 1220, 8135, 5933, 13, 1209, 11760, 3092, 4, 1773, 52, 64, 75, 3993, 42, 1445, 2788, 88, 5, 1421, 396, 43064, 1295, 24, 6, 52, 3253, 43064, 1258, 7, 1306, 14, 5, 2788, 10698, 624, 5, 1421, 18, 4532, 8135, 5933, 3000, 4, 2]
[0, 713, 16, 41, 1246, 9, 10, 2]


In [39]:
# return tensorflow / pytorch / numpy tensor
# return_tensors (str or TensorType, optional) — If set, will return tensors instead of list of python integers. Acceptable values are:
# 'tf': Return TensorFlow tf.constant objects.
# 'pt': Return PyTorch torch.Tensor objects.
# 'np': Return Numpy np.ndarray objects.

model_name = 'distilbert/distilroberta-base'
max_length = 16
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = ["Hello World"]

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors='tf')
print(type(encoding["input_ids"]))

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors='pt')
print(type(encoding["input_ids"]))

encoding = tokenizer(text, padding="max_length", truncation=True, max_length=max_length, return_tensors='np')
print(type(encoding["input_ids"]))

<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'torch.Tensor'>
<class 'numpy.ndarray'>
