<a href="https://colab.research.google.com/github/datxander/NLP/blob/main/HuggingFace/HF_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Word based tokenizer example

text  = "The dancer gave him a cigarette"
tokenized_text = text.split()
print(tokenized_text)

['The', 'dancer', 'gave', 'him', 'a', 'cigarette']


In [2]:
# Other approaches include character based and subword based tokenizers

In [3]:
# BERT tokenizer

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Autotokenizer

from transformers import AutoTokenizer
tokenozer = AutoTokenizer.from_pretrained("bert-base-cased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
tokenizer(text)

{'input_ids': [101, 1109, 9227, 1522, 1140, 170, 9983, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
# Let's dive into how these input_ids were generated aka ENCODING

# Encoding is done in a two-step process: the tokenization, followed by the conversion to input IDs.

#1. Tokenization

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

tokens = tokenizer.tokenize(text)
print(tokens)

['The', 'dancer', 'gave', 'him', 'a', 'cigarette']


In [6]:
#2. Converting these tokens to input IDs

ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[1109, 9227, 1522, 1140, 170, 9983]


In [7]:
# Decoding is the process of turning these tokens back into text

decoded_string = tokenizer.decode([101, 1109, 9227, 1522, 1140, 170, 9983, 102])
print(decoded_string)

[CLS] The dancer gave him a cigarette [SEP]


In [8]:
# Does this approach scale for multiple sequences? Illustration & error below

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids)
# This line will fail.
model(input_ids)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

IndexError: too many indices for tensor of dimension 1

In [9]:
# This error is because we sent a signle sequence to the model, but Transformer models expect multiple sentences by default.
# However, the tokenizer added another dimension when it did its thing

tokenized_inputs = tokenizer(text, return_tensors = "pt")
print(tokenized_inputs["input_ids"])

tensor([[ 101, 1996, 8033, 2435, 2032, 1037, 9907,  102]])


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [11]:
input_ids = [[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607, 2026,  2878,  2166,  1012]]
logits : [[-2.7276,  2.8789]]


In [None]:
batched_ids = [ids, ids]

In [12]:
# Input padding - All sentences have to be the same length so that they can be converted to a tensor. This is done through padding

padding_id = 100

batched_ids = [
    [200,200,200],
    [200,200,padding_id]
]


# Read up on why these 2 sentences would have different tokens when processed individually, and when they're processed as a batch. Hint - Attention layers

# https://huggingface.co/learn/nlp-course/chapter2/5?fw=pt

In [14]:
# Attention masks solve for this issue, so that we can specify what tokens should be ignored by the attention layer

batched_ids = [
    [200,200,200],
    [200,200,tokenizer.pad_token_id],
]

attention_mask = [
    [1,1,1],
    [1,1,0],
]

outputs = model(torch.tensor(batched_ids),attention_mask = torch.tensor(attention_mask))
print(output.logits)

tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [2]:
# Transformers API does all the above steps directly

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "It was the best of times, it was the worst of times"

model_inputs = tokenizer(sequence)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [3]:
# Multiple sequences are also handled similarly

sequences = ["It was the best of times", "It was the worst of times"]

model_inputs = tokenizer(sequences)

In [4]:
# Padding has several options

# Max sequence length :
model_inputs = tokenizer(sequences,padding = "longest")

# Max model length
model_inputs = tokenizer(sequence, padding = "max_length")

# Specified max length
model_inputs = tokenizer(sequence, padding = "max_length", max_length = 16)


In [5]:
# Truncation also has similar options

# Truncating sentences longer than max length
model_inputs = tokenizer(sequences, truncation = True)

# Truncating sentences longer than specified max length
model_inputs = tokenizer(sequences, max_length = 8, truncation = True)

In [6]:
# We can also convert tokens to specific framework tensors, to be sent to the models

#Pytorch
model_inputs = tokenizer(sequences, padding = True, return_tensors = "pt")

#Tensorflow
model_inputs = tokenizer(sequences, padding = True, return_tensors = "tf")


#Numpy
model_inputs = tokenizer(sequences, padding = True, return_tensors = "np")

In [9]:
# Let's bring these all together with an example

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

dickens = ["it was the age of wisdom", "it was the age of foolishness", "it was the epoch of belief", "it was the epoch of incredulity"]

tokens = tokenizer(dickens, padding = True, truncation = True, return_tensors = "pt")
output = model(**tokens)



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]