In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the pretrained tokenizer inside this cell so that re-running the cell
# always registers '[EOT]' on a freshly loaded tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print("Before")
print(tokenizer.all_special_tokens) # --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
print(tokenizer.all_special_ids)    # --> [100, 102, 0, 101, 103]


# Register '[EOT]' as an additional special token so the tokenizer keeps it as
# a single token instead of splitting it into word pieces.
special_tokens_dict = {'additional_special_tokens': ['[EOT]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
assert num_added_toks == 1, f"expected to add exactly one token, added {num_added_toks}"

# NOTE: no model is loaded in this cell. Once one is, its embedding matrix has
# to be resized to cover the new token id before the model sees '[EOT]':
# model.resize_token_embeddings(len(tokenizer))  # --> Embedding(30523, 768)

tok_id = tokenizer.convert_tokens_to_ids('[EOT]')  # --> 30522
# An unregistered token would resolve to the [UNK] id, so guard against that.
assert tok_id != tokenizer.unk_token_id, "'[EOT]' was not registered in the vocabulary"

print("After")
print(tokenizer.all_special_tokens) # --> ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '[EOT]']
print(tokenizer.all_special_ids)    # --> [100, 102, 0, 101, 103, 30522]

Before
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']
[100, 102, 0, 101, 103]
After
['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]', '[EOT]']
[100, 102, 0, 101, 103, 30522]


In [2]:
# Multi-turn dialogue in which every turn except the last is terminated by the
# custom '[EOT]' special token.
text_to_encode = '''QUERY: I want to ask a question. [EOT]
ANSWER: Sure, ask away. [EOT]
QUERY: How is the weather today? [EOT]
ANSWER: It is nice and sunny. [EOT]
QUERY: Okay, nice to know. [EOT]
ANSWER: Would you like to know anything else?'''

# Call the tokenizer directly (`encode_plus` is deprecated in favour of
# `__call__`). Only the token ids are kept, so the token-type ids and the
# attention mask are not requested.
enc = tokenizer(
  text_to_encode,
  max_length=128,
  truncation=True,
  add_special_tokens=True,
  return_token_type_ids=False,
  return_attention_mask=False,
)['input_ids']

In [3]:
# Map the encoded ids back to token strings to inspect how '[EOT]' was tokenized.
print(tokenizer.convert_ids_to_tokens(ids=enc))

['[CLS]', 'query', ':', 'i', 'want', 'to', 'ask', 'a', 'question', '.', '[EOT]', 'answer', ':', 'sure', ',', 'ask', 'away', '.', '[EOT]', 'query', ':', 'how', 'is', 'the', 'weather', 'today', '?', '[EOT]', 'answer', ':', 'it', 'is', 'nice', 'and', 'sunny', '.', '[EOT]', 'query', ':', 'okay', ',', 'nice', 'to', 'know', '.', '[EOT]', 'answer', ':', 'would', 'you', 'like', 'to', 'know', 'anything', 'else', '?', '[SEP]']


In [4]:
# Pair each special token with its vocabulary id.
dict(zip(tokenizer.all_special_tokens, tokenizer.all_special_ids))

{'[UNK]': 100,
 '[SEP]': 102,
 '[PAD]': 0,
 '[CLS]': 101,
 '[MASK]': 103,
 '[EOT]': 30522}