# Tokenizers (PyTorch)

## Word-based

In [None]:
# Simplest word-based tokenizer: break the sentence apart on runs of whitespace.
tokenized_text = str.split("Jim Henson was a puppeteer")
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']

### Loading and saving

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint through the
# BERT-specific tokenizer class.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:
from transformers import AutoTokenizer

# Load the same "bert-base-cased" checkpoint through the generic AutoTokenizer
# entry point instead of naming the BERT class explicitly.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
# Calling the tokenizer directly on a sentence returns the full encoding; the
# cell output lists its input_ids, token_type_ids and attention_mask.
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 11303, 1200, 2443, 1110, 3014, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Save the loaded tokenizer's files under the given local directory.
tokenizer.save_pretrained("directory_on_my_computer")

## Encoding

### Tokenization

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
# tokenize() only splits the text into token strings: no IDs are produced yet
# and, as the output shows, no special tokens are added at this step.
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'transform', '##er', 'network', 'is', 'simple']

### From tokens to input IDs

In [None]:
# Map each token string from the previous cell to its ID.
# Relies on `tokenizer` and `tokens` defined in the tokenization cell.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 11303, 1200, 2443, 1110, 3014]

## Decoding

In [None]:
# decode() turns a list of IDs back into text. The IDs are the ones printed
# in the "From tokens to input IDs" cell; per the output, the "##" word piece
# is merged back into a single word.
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

'Using a Transformer network is simple'

# Tokenization pipeline

In [None]:
from transformers import AutoTokenizer

# The expected IDs below (and the lowercased tokens in the following cells)
# belong to the uncased BERT vocabulary, so load the uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's try to tokenize!"
# Calling the tokenizer runs the whole pipeline in one go; the next sections
# reproduce it step by step (split to tokens, map to IDs, add special tokens).
inputs = tokenizer(sequence)
# The call returns a dict-like encoding; print only the IDs so the output
# matches the list shown below.
print(inputs["input_ids"])
# [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]

## Split to tokens

In [None]:

from transformers import AutoTokenizer

# Uncased checkpoint: the expected output below is lowercased ("let", not
# "Let"), which the cased checkpoint would not produce.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's try to tokenize!"
# Step 1 of the pipeline: split the text into (sub)word tokens only.
tokens = tokenizer.tokenize(sequence)
print(tokens)
# ['let', "'", 's', 'try', 'to', 'token', '##ize', '!']

In [None]:
from transformers import AutoTokenizer
# Same sentence with an ALBERT checkpoint for comparison: in the expected
# output below, pieces that start a word carry a leading "_" rather than
# continuation pieces carrying a "##" prefix.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v1")
sequence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sequence)
print(tokens)
# Expected output, in the notebook's shorthand notation:
#[_let, ', s, _try, _to, _to, ken, ize, !]
# NOTE(review): "_" presumably stands for the tokenizer's word-start marker
# character — confirm against a real run.

## Map tokens to input ids

In [None]:
from transformers import AutoTokenizer

# Uncased checkpoint: the expected IDs below are the uncased-vocabulary IDs
# used throughout this part of the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's try to tokenize!"
tokens = tokenizer.tokenize(sequence)
# Step 2 of the pipeline: look up the ID of every token. No special tokens
# are added here, so the list has one entry per token.
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)
# [2292, 1005, 1055, 3046, 2000, 19204, 4697, 999]

## Add special tokens

In [None]:
# Step 3 of the pipeline: wrap the IDs from the previous cell with the model's
# special tokens (101 at the start and 102 at the end in the expected output).
# Relies on `tokenizer` and `input_ids` defined in the previous cell.
final_inputs = tokenizer.prepare_for_model(input_ids)
# prepare_for_model() returns a dict-like encoding, not a bare list; print the
# "input_ids" entry so the output matches the list shown below.
print(final_inputs["input_ids"])
# [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102]

# Decoding

In [None]:
from transformers import AutoTokenizer

# Uncased checkpoint: the expected decoded text below is lowercased, which
# the cased checkpoint would not produce.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's try to tokenize!"
inputs = tokenizer(sequence)
# Decoding the full encoding makes the added special tokens visible.
decoded_string = tokenizer.decode(inputs["input_ids"])
print(decoded_string)
# "[CLS] let's try to tokenize! [SEP]"

In [None]:
from transformers import AutoTokenizer

# "roberta" on its own is not a checkpoint identifier and from_pretrained()
# fails on it; "roberta-base" is the published base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
sequence = "Let's try to tokenize!"
inputs = tokenizer(sequence)
# Same experiment as with BERT: decode the full encoding to see which special
# tokens this model's tokenizer adds (<s> ... </s> instead of [CLS] ... [SEP]).
decoded_string = tokenizer.decode(inputs["input_ids"])
print(decoded_string)
# RoBERTa's tokenizer is case-sensitive, so the capital "L" is preserved.
# "<s>Let's try to tokenize!</s>"

In [None]:
from transformers import AutoTokenizer

# Uncased checkpoint: the expected input_ids below are uncased-vocabulary IDs.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequence = "Let's try to tokenize!"
inputs = tokenizer(sequence)
# Print the whole encoding this time: besides input_ids it carries
# token_type_ids and attention_mask, each with one entry per input ID
# (10 here, including the two special tokens).
print(inputs)
# {'input_ids': [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 999, 102],
# 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
# 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}