In [10]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

In [2]:
import torch

# Show which PyTorch build this notebook was executed with.
torch_version = torch.__version__
print(torch_version)

2.0.1


In [4]:
# Checkpoint identifier shared by the model and the tokenizer below.
model_name = "bert-base-cased"

# Load the pretrained model and its tokenizer from the same checkpoint name.
# In this run the files were downloaded (see the progress output of this cell).
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 
Downloading model.safetensors: 100%|██████████| 436M/436M [00:25<00:00, 17.3MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<?, ?B/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 213k/213k [00:00<00:00, 1.88MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 436k/436k [00:00<00:00, 6.83MB/s]


In [7]:
# Example sentence reused by the following cells.
sentence = "When life gives you lemons, don't make lemonade."

# Split the sentence into sub-word tokens; pieces that continue a word are
# prefixed with "##" (e.g. "lemon" + "##ade").
tokens = tokenizer.tokenize(sentence)

In [8]:
# Display the sub-word tokens produced for the example sentence.
tokens

['When',
 'life',
 'gives',
 'you',
 'lemon',
 '##s',
 ',',
 'don',
 "'",
 't',
 'make',
 'lemon',
 '##ade',
 '.']

In [11]:
# token -> token_id mapping exposed by the tokenizer.
vocab = tokenizer.vocab

# Build a lookup table with one row per vocabulary entry, ordered by id and
# indexed by token_id so that a token can be looked up from its id.
vocab_df = (
    pd.DataFrame(list(vocab.items()), columns=["token", "token_id"])
    .sort_values(by="token_id")
    .set_index("token_id")
)

In [12]:
# Display the vocabulary table (pandas truncates the middle rows).
vocab_df

Unnamed: 0_level_0,token
token_id,Unnamed: 1_level_1
0,[PAD]
1,[unused1]
2,[unused2]
3,[unused3]
4,[unused4]
...,...
28991,##）
28992,##，
28993,##－
28994,##／


In [13]:
# Encode the sentence directly into a list of vocabulary ids.
token_ids = tokenizer.encode(sentence)

In [14]:
# Number of sub-word tokens returned by tokenizer.tokenize.
len(tokens)

14

In [15]:
# Number of ids returned by tokenizer.encode, to compare with len(tokens).
len(token_ids)

16

The two extra IDs come from the special tokens that `encode` adds: `[CLS]` at the beginning and `[SEP]` at the end.

In [16]:
# Look up the special tokens by their vocabulary id.
# `vocab_df` is indexed by token_id, so label-based `.loc` is the matching
# accessor; positional `.iloc` only gave the right rows because the ids are
# contiguous and start at 0. The ids are read from the tokenizer instead of
# being hard-coded as 101 / 102.
print(vocab_df.loc[tokenizer.cls_token_id])
print(vocab_df.loc[tokenizer.sep_token_id])

token    [CLS]
Name: 101, dtype: object
token    [SEP]
Name: 102, dtype: object


In [17]:
# Pair each sub-word token with its id. The slice drops the first and last
# ids, which belong to the special tokens rather than to the sentence.
[(piece, piece_id) for piece, piece_id in zip(tokens, token_ids[1:-1])]

[('When', 1332),
 ('life', 1297),
 ('gives', 3114),
 ('you', 1128),
 ('lemon', 22782),
 ('##s', 1116),
 (',', 117),
 ('don', 1274),
 ("'", 112),
 ('t', 189),
 ('make', 1294),
 ('lemon', 22782),
 ('##ade', 6397),
 ('.', 119)]

We can get the entire sentence back by decoding it

In [18]:
# Turn the ids back into text; the special tokens appear in the result.
tokenizer.decode(token_ids)

"[CLS] When life gives you lemons, don't make lemonade. [SEP]"

In [20]:
# Calling the tokenizer directly returns the input_ids together with the
# token_type_ids and the attention_mask (see the output of this cell).
tokenizer_out = tokenizer(sentence)
tokenizer_out

{'input_ids': [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

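Look at the attention mask: it holds a 1 for every real token and a 0 for every padding token. For a single sentence, as above, it is all ones. When sentences of different lengths are batched (next cell), dummy `[PAD]` tokens are added so that every row of the matrix has the same length, and their mask value of 0 tells the self-attention step to ignore them so they do not influence the final result.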

In [23]:
# Second, shorter sentence: the same text with "don't" removed.
sentence2 = sentence.replace("don't", "")

# Tokenize both sentences as one batch. padding=True pads the shorter one so
# both rows have the same length; the padded positions get attention_mask 0.
batch_sentences = [sentence, sentence2]
tokenizer_out2 = tokenizer(batch_sentences, padding=True)
tokenizer_out2

{'input_ids': [[101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1274, 112, 189, 1294, 22782, 6397, 119, 102], [101, 1332, 1297, 3114, 1128, 22782, 1116, 117, 1294, 22782, 6397, 119, 102, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]}

In [25]:
# Decode the shorter sentence of the batch; the padding ids show up as [PAD].
tokenizer.decode(tokenizer_out2['input_ids'][1])

'[CLS] When life gives you lemons, make lemonade. [SEP] [PAD] [PAD] [PAD]'

In [26]:
# Decode the longer sentence of the batch; it needed no padding.
tokenizer.decode(tokenizer_out2['input_ids'][0])


"[CLS] When life gives you lemons, don't make lemonade. [SEP]"