# Tokenizer

## Importing the Libraries

In [None]:
from transformers import BertModel, AutoTokenizer
import pandas as pd

## Defining the model and the tokenizer

In [None]:
# Checkpoint name shared by the model and the tokenizer so the two match.
model_name = "bert-base-cased"

# Load the pretrained model and its tokenizer from the same checkpoint name.
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Defining the sentence to be tokenized

In [None]:
# Example input reused by the cells below; it contains a comma, an
# apostrophe contraction ("don't") and a final full stop.
sentence = "When life gives you lemons, don't make lemonade."

## Tokenizing the sentence

In [None]:
# Split the sentence into the tokenizer's string tokens (no ids yet).
tokens = tokenizer.tokenize(sentence)
print(tokens)

## Displaying the tokenizer's vocabulary list

In [None]:
# Token -> id mapping exposed by the tokenizer.
vocab = tokenizer.vocab

# One row per vocabulary entry, ordered by id and indexed by token_id.
vocab_df = (
    pd.DataFrame({"token": vocab.keys(), "token_id": vocab.values()})
    .sort_values(by="token_id")
    .set_index("token_id")
)
print(vocab_df)

## Displaying the token IDs for the sentence

In [None]:
# Convert the sentence straight to token ids. The result has one extra id at
# each end compared with `tokens`, which is why a later cell slices [1:-1].
token_ids = tokenizer.encode(sentence)
print(token_ids)

## Comparing the token IDs and tokens

In [None]:
# Pair every token with its id; the first and last ids are skipped because
# they belong to the special tokens added by encode(), not to the sentence.
print([(tok, tok_id) for tok, tok_id in zip(tokens, token_ids[1:-1])])



## Decoding the tokens back into the sentence

In [None]:
# Round trip: decode the ids produced by encode() back into text.
print(tokenizer.decode(token_ids))

# vocab_df is indexed by token_id, so look the special tokens up by label
# with .loc. Positional .iloc only returns the right row when the ids happen
# to be contiguous from 0. Asking the tokenizer for the ids also avoids
# hard-coding 101/102, which are specific to this checkpoint.
print(vocab_df.loc[tokenizer.cls_token_id])
print(vocab_df.loc[tokenizer.sep_token_id])

# The tokenizer adds a special token at each end of the sequence: 101 is [CLS] and 102 is [SEP].

## Displaying the entire output of the tokenizer, which is the input to the model

In [None]:
# Calling the tokenizer directly (rather than .tokenize/.encode) returns the
# complete encoding that is passed to the model; print it to inspect its fields.
tokenizer_out = tokenizer(sentence)
print(tokenizer_out)

## Creating another sentence to show the difference in the attention mask

In [None]:
# Second sentence: the original with "don't " removed, so it is shorter.
sentence2 = sentence.replace("don't ", "")

# Encode both sentences as one batch; padding=True makes both sentences the
# same token length.
tokenizer_out2 = tokenizer([sentence, sentence2], padding=True)

print(tokenizer_out2)

# Decode every row of the batch back to text, one sentence per line.
print("\n".join(tokenizer.decode(ids) for ids in tokenizer_out2["input_ids"]))

# The attention mask tells the model to ignore the [PAD] tokens.