# Tokenizer

## Importing the Libraries

In [1]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Defining the model and the tokenizer

In [2]:
# Checkpoint identifier shared by the model and the tokenizer so the two stay
# in sync with each other.
model_name = "bert-base-cased"

# NOTE(review): `model` is not referenced by any of the tokenizer cells visible
# in this notebook; confirm it is needed later before paying the load cost.
model = BertModel.from_pretrained(model_name)
# AutoTokenizer resolves the tokenizer from the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Defining the sentence to be tokenized

In [3]:
# Example input used by every cell below. It contains a contraction ("don't")
# and the words "lemons"/"lemonade", which the tokenizer splits into several
# pieces, so the sub-word behaviour is visible in the outputs.
sentence = "When life gives you lemons, don't make lemonade."

## Tokenizing the sentence

In [6]:
# tokenize() returns the sub-word pieces as strings (not ids). Pieces that
# continue a word are prefixed with "##" (e.g. "##s", "##ade" in the output),
# and no [CLS]/[SEP] markers appear in this list.
tokens = tokenizer.tokenize(sentence)
print(tokens)

['When', 'life', 'gives', 'you', 'lemon', '##s', ',', 'don', "'", 't', 'make', 'lemon', '##ade', '.']


## Displaying the tokenizer's vocabulary list

In [7]:
# Token-string -> integer-id mapping exposed by the tokenizer.
vocab = tokenizer.vocab

# One row per vocabulary entry, ordered by id and indexed by it, so the
# index label of each row is the token id itself.
vocab_df = (
    pd.DataFrame(list(vocab.items()), columns=["token", "token_id"])
    .sort_values(by="token_id")
    .set_index("token_id")
)
print(vocab_df)

              token
token_id           
0             [PAD]
1         [unused1]
2         [unused2]
3         [unused3]
4         [unused4]
...             ...
28991           ##）
28992           ##，
28993           ##－
28994           ##／
28995           ##：

[28996 rows x 1 columns]


## Displaying the token IDs for the sentence

In [8]:
# encode() goes straight from text to vocabulary ids. The result is two entries
# longer than `tokens`: one extra id is added at each end of the sequence
# (101 and 102 in the output below).
token_ids = tokenizer.encode(sentence)
print(token_ids)

[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102]


## Comparing the token IDs and tokens

In [18]:
# Pair every token string with its id. The first and last ids are sliced off
# because encode() added one id at each end that has no counterpart in `tokens`.
print([(token, token_id) for token, token_id in zip(tokens, token_ids[1:-1])])



[('When', 1332), ('life', 1297), ('gives', 3114), ('you', 1128), ('lemon', 22782), ('##s', 1116), (',', 117), ('don', 1274), ("'", 112), ('t', 189), ('make', 1294), ('lemon', 22782), ('##ade', 6397), ('.', 119)]


## Decoding the tokens back into the sentence

In [21]:
# Round-trip the ids back to text; the special tokens show up in the string.
print(tokenizer.decode(token_ids))

# Look the special-token ids up by *label*: `vocab_df` is indexed by token_id,
# so `.loc` is the lookup that matches the intent. `.iloc` is positional and
# would only agree while the index is a sorted, gap-free range starting at 0.
print(vocab_df.loc[101])
print(vocab_df.loc[102])

# The tokenizer inserts special edge tokens: 101 is [CLS] and 102 is [SEP].

[CLS] When life gives you lemons, don't make lemonade. [SEP]
token    [CLS]
Name: 101, dtype: object
token    [SEP]
Name: 102, dtype: object


## Displaying the entire output of the tokenizer, which is the input to the model

In [22]:
# Calling the tokenizer object directly returns everything shown in the output
# below: `input_ids` (same ids as encode()), `token_type_ids` and
# `attention_mask`.
tokenizer_out = tokenizer(sentence)
print(tokenizer_out)

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


## Creating another sentence to show the difference in the attention mask

In [25]:
# Shorter variant of the sentence so the batch holds sequences of unequal length.
sentence2 = sentence.replace("don't ", "")

# Tokenize both sentences together; with padding=True the shorter row is
# filled with id 0 ([PAD]) up to the length of the longer one.
tokenizer_out2 = tokenizer([sentence, sentence2], padding=True)
print(tokenizer_out2)

# Decode each row of the batch, one per line, to make the padding visible.
print(*map(tokenizer.decode, tokenizer_out2["input_ids"]), sep="\n")

# The attention mask tells the model to ignore the [PAD] tokens: it is 1 for
# real tokens and 0 at the padded positions.

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}
[CLS] When life gives you lemons, don't make lemonade. [SEP]
[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]
