<a href="https://colab.research.google.com/github/codedm24/Transformers/blob/Transformers-Intro/Tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Word based

In [9]:
# Word-based tokenization: calling split with no separator cuts the sentence
# on whitespace, so every word becomes one token.
tokenized_text = "Jim Henson was a puppeteer".split(None)
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteer']


Character based

In [1]:
# Character-based tokenization: every character, including each space,
# becomes its own token.
tokenized_text = [char for char in "Jim Henson was a puppeteer"]
print(tokenized_text)

['J', 'i', 'm', ' ', 'H', 'e', 'n', 's', 'o', 'n', ' ', 'w', 'a', 's', ' ', 'a', ' ', 'p', 'u', 'p', 'p', 'e', 't', 'e', 'e', 'r']


Subword

In [10]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Subword tokenization: train a small BPE tokenizer on a one-sentence corpus.
# NOTE(review): the training text spells "stuyding" while the text encoded
# below spells "studying". The encoded word was therefore never seen during
# training and is broken into smaller pieces in the printed tokens.
# Confirm that this mismatch is intentional.
text = ["Hello Copilot, stuyding NLP"]

tokenizer = Tokenizer(BPE())
# The pre-tokenizer runs before the BPE model; in the printed tokens the
# comma ends up as a token of its own.
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer()
tokenizer.train_from_iterator(text, trainer=trainer)

encoded = tokenizer.encode("Hello Copilot, studying NLP")
print(encoded.tokens)

['Hello', 'Copilot', ',', 'st', 'u', 'd', 'y', 'i', 'ng', 'NLP']


In [8]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Word-level tokenization with the `tokenizers` library: the vocabulary is
# built from whole words of the training text.
text = ["Hello Copilot, stuyding NLP"]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()

trainer = WordLevelTrainer()
tokenizer.train_from_iterator(text, trainer=trainer)

# The encoded sentence is identical to the training sentence (including the
# "stuyding" spelling), so every word is already in the vocabulary.
# NOTE(review): no unknown-token is configured on WordLevel(); presumably
# encoding a word absent from the training text would fail - verify.
encoded = tokenizer.encode("Hello Copilot, stuyding NLP")
print(encoded.tokens)

['Hello', 'Copilot', ',', 'stuyding', 'NLP']


Using Tokenizer

In [None]:
from transformers import BertTokenizer
# Load the pretrained "bert-base-cased" tokenizer and split a sentence into
# tokens. The variable was previously misspelled `tokeinzer`; it now uses the
# same `tokenizer` name as every other cell in this notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer.tokenize("Jim Henson was a puppeteer")
print(tokens)

['Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer']


Using AutoTokenizer

In [11]:
from transformers import AutoTokenizer
# AutoTokenizer resolves the tokenizer class from the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize each sample in turn. After the loop `tokens` holds the result for
# the LAST sample, exactly as with two consecutive assignments.
for sample in (
    "Using a Transformer network is simple",
    "Hello Copilot, studying NLP",
):
    tokens = tokenizer.tokenize(sample)
    print(tokens)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
['Hello', 'Co', '##pi', '##lot', ',', 'studying', 'NL', '##P']


Converting tokens to ids

In [None]:
# Tokenize the sentence explicitly instead of relying on whatever `tokens`
# currently holds: the previous cell leaves `tokens` bound to the
# "Hello Copilot, studying NLP" result (8 tokens), whereas the ids shown
# below this cell are the 7 ids of the sentence used here. Re-tokenizing
# makes the cell reproducible on a fresh top-to-bottom run.
tokens = tokenizer.tokenize("Using a Transformer network is simple")

# Map every token string to its integer id in the tokenizer's vocabulary.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


Decoding

In [None]:
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple


Handling multiple sequences

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and sequence-classification model are loaded from the same checkpoint.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "Learning NLP is interesting"

# Step 1: text -> token strings.
tokens = tokenizer.tokenize(sequence)
print(f"Tokens: {tokens}")

# Step 2: token strings -> vocabulary ids (a plain Python list).
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Ids: ", type(input_ids) , ", ", input_ids)

# Step 3: wrap the id list in an outer list so the tensor gets a leading
# batch dimension of size 1 (see the printed tensor).
tensor = torch.tensor([input_ids])
print("Tensor", type(tensor) , ", ", tensor)

# Step 4: run the model on the batch and show the raw logits.
output = model(input_ids=tensor)
print("Logits: ",output.logits)

Tokens: ['learning', 'nl', '##p', 'is', 'interesting']
Ids:  <class 'list'> ,  [4083, 17953, 2361, 2003, 5875]
Tensor <class 'torch.Tensor'> ,  tensor([[ 4083, 17953,  2361,  2003,  5875]])
Logits:  tensor([[-2.4032,  2.5175]], grad_fn=<AddmmBackward0>)


Batched ids

In [None]:
# Build a batch that contains the same sequence twice and run it through the
# model; both rows should yield identical logits.
#
# `tokenize` returns token strings, so the `return_tensors="pt"` argument that
# used to be passed here had no useful effect and has been dropped. The
# conversion to a tensor happens explicitly via torch.tensor below.
tokens = tokenizer.tokenize(sequence)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Two identical rows -> a tensor with two rows (see the printed tensor).
batched_ids = [input_ids, input_ids]
tensor = torch.tensor(batched_ids)
print(tensor)

output = model(tensor)
print("Logits: ", output.logits)

tensor([[ 4083, 17953,  2361,  2003,  5875],
        [ 4083, 17953,  2361,  2003,  5875]])
Logits:  tensor([[-2.4032,  2.5175],
        [-2.4032,  2.5175]], grad_fn=<AddmmBackward0>)


Padding the inputs

In [None]:
# Replace the last id of the second row with a filler id and feed the batch
# to the model WITHOUT an attention mask. The printed logits of the second
# row differ from those of the first row.
# NOTE(review): 100 is not the id the tokenizer itself pads with (the padded
# outputs later in this notebook use 0, and the next cell uses
# tokenizer.pad_token_id) - presumably an arbitrary placeholder; confirm.
padding_id = 100
batched_ids = [
    [4083, 17953, 2361, 2003, 5875],
    [4083, 17953, 2361, 2003, padding_id],
]

tensor = torch.tensor(batched_ids)
output = model(input_ids=tensor)
print("Logits: ", output.logits)

Logits:  tensor([[-2.4032,  2.5175],
        [-0.5371,  0.6110]], grad_fn=<AddmmBackward0>)


Attention masks

In [None]:
# Same batch shape as in the padding cell, but the filler position now uses
# the tokenizer's own pad id and is masked out via the attention mask.
batched_ids = [
    [4083, 17953, 2361, 2003, 5875],
    [4083, 17953, 2361, 2003, tokenizer.pad_token_id],
]
tensor = torch.tensor(batched_ids)

# 1 = real token, 0 = padded position; only the last slot of row 2 is masked.
attention_masks = [
    [1, 1, 1, 1, 1],
    [1, 1, 1, 1, 0],
]
tensor1 = torch.tensor(attention_masks)

output = model(input_ids=tensor, attention_mask=tensor1)
print("Logits: ", output.logits)

Logits:  tensor([[-2.4032,  2.5175],
        [-1.6812,  1.6532]], grad_fn=<AddmmBackward0>)


Putting it all together with tokenizer()

In [None]:
# Calling the tokenizer object directly yields both 'input_ids' and
# 'attention_mask' in one step (see the printed result).
sequence = "Learning NLP is interesting"
model_input = tokenizer(text=sequence)
print(f"Model input: {model_input}")

Model input: {'input_ids': [101, 4083, 17953, 2361, 2003, 5875, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


Using multiple sequences

In [None]:
# A list of sentences can be passed in a single call; the result then holds
# one id list and one attention mask per sentence.
sequences = [
    "Learning NLP is interesting",
    "Learning NLP is not interesting",
]
model_inputs = tokenizer(text=sequences)
print(f"Model input: {model_inputs}")

Model input: {'input_ids': [[101, 4083, 17953, 2361, 2003, 5875, 102], [101, 4083, 17953, 2361, 2003, 2025, 5875, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


Tokenizer() settings

In [None]:
sequences = ["Learning NLP is interesting", "Learn NLP"]

# One dict of keyword arguments per tokenizer() call, tried in this order.
# Every result is printed; `model_inputs` ends up holding the last one.
tokenizer_settings = [
    # default call, no padding or truncation options
    {},
    # set padding to 'longest'
    {"padding": "longest"},
    # set padding to 'max_length' (no explicit max_length given)
    {"padding": "max_length"},
    # set padding to max_length, set max_length value
    {"padding": "max_length", "max_length": 8},
    # set truncation
    {"truncation": True},
    # set truncation with max length
    {"truncation": True, "max_length": 8},
    # converting tensor to specific framework - pytorch
    {"padding": True, "return_tensors": "pt"},
    # converting tensor to specific framework - pytorch, all settings
    {"padding": True, "truncation": True, "return_tensors": "pt"},
]

for call_kwargs in tokenizer_settings:
    model_inputs = tokenizer(sequences, **call_kwargs)
    print(f"Model input: {model_inputs}")

Model input: {'input_ids': [[101, 4083, 17953, 2361, 2003, 5875, 102], [101, 4553, 17953, 2361, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
Model input: {'input_ids': [[101, 4083, 17953, 2361, 2003, 5875, 102], [101, 4553, 17953, 2361, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 0]]}
Model input: {'input_ids': [[101, 4083, 17953, 2361, 2003, 5875, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Special Tokens

In [None]:
# Encode a sentence with the tokenizer call, then decode the resulting ids.
# The decoded text (printed below) starts with [CLS] and ends with [SEP],
# which shows that the call added special tokens around the sentence.
sequence = "Learning NLP is interesting"
model_inputs = tokenizer(text=sequence)
print(f"Model input: {model_inputs}")

# NOTE: `sequence` is deliberately rebound to the decoded string here, so
# after this cell it no longer holds the original input sentence.
sequence = tokenizer.decode(
    token_ids=model_inputs["input_ids"],
    clean_up_tokenization_spaces=True,
)
print(sequence)

Model input: {'input_ids': [101, 4083, 17953, 2361, 2003, 5875, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[CLS] learning nlp is interesting [SEP]
