## Modified from Getting Started with Google BERT

## From https://huggingface.co/transformers/

In [1]:
import torch
from transformers import BertConfig, BertModel, BertTokenizer

# Build a model with random (untrained) weights from a default BertConfig,
# i.e. a bert-base-uncased style architecture, then read the configuration
# back from the model itself.
model = BertModel(BertConfig())
configuration = model.config

### We can also download a pre-trained BERT model -- check here: https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/model#transformers.PreTrainedModel.from_pretrained. We will use the 'bert-base-uncased' model. As the name suggests, it is the BERT-base model with 12 encoder layers, and it is trained with uncased (lowercased) tokens. Since we are using BERT-base, each token will be represented by 768 features (the embedding size). 

In [2]:
# Download the pre-trained model and its configuration from huggingface.co and cache them.
# This rebinds `model`, replacing the randomly initialised model created in the first cell.
model = BertModel.from_pretrained("bert-base-uncased")

### Next, we will download and load the tokenizer that was used for pretraining the bert-base-uncased model: 

In [3]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


### Now, let's see how to preprocess the input before feeding it to BERT. 


## Preprocessing the input 
### Define the sentence:

In [4]:
# Example input sentence used by the preprocessing steps that follow.
sentence = 'I am good'

### Tokenize the sentence and obtain the tokens:

In [5]:
# Split the raw sentence into a list of token strings (no special tokens yet).
tokens = tokenizer.tokenize(sentence)

### Let's print the uncased tokens:

In [6]:
# Show the tokens produced by the tokenizer.
print(tokens)

['i', 'am', 'good']


### Now, we will add the [CLS] token at the beginning and the [SEP] token at the end of the sentence. If you had multiple sentences, you would also have to add a [SEP] token between them.  

In [7]:
# Wrap the word tokens with BERT's special markers: [CLS] first, [SEP] last.
tokens = ['[CLS]', *tokens, '[SEP]']

### Let's look at our updated tokens list:

In [8]:
# Show the token list after adding the special tokens.
print(tokens)

['[CLS]', 'i', 'am', 'good', '[SEP]']


### As we can observe, we have the [CLS] token at the beginning and the [SEP] token at the end of our tokens list. We can also observe that the length of our tokens list is 5.

### Say we need the length of our tokens list to be 7; in that case, we will add two [PAD] tokens at the end, as shown in the following:



In [9]:
# Pad up to a fixed target length instead of appending a hard-coded number of
# [PAD] tokens. Computing the deficit keeps the cell idempotent: re-running it
# adds nothing once the list already has `max_length` entries. A list longer
# than `max_length` is left unchanged (this cell does not truncate).
max_length = 7
tokens = tokens + ['[PAD]'] * (max_length - len(tokens))

### Let's print our updated tokens list:

In [10]:
# Show the token list after padding.
print(tokens)

['[CLS]', 'i', 'am', 'good', '[SEP]', '[PAD]', '[PAD]']




### As we can observe, our tokens list now ends with [PAD] tokens and its length is 7. Next, we create the attention mask. We set the attention mask value to 1 if the token is not a [PAD] token; otherwise, we set it to 0, as shown below:



In [11]:
# One mask entry per token: 1 for every real token, 0 for each [PAD] token.
attention_mask = [int(token != '[PAD]') for token in tokens]

### Let's print the attention_mask:

In [12]:
# Show the mask, which has one entry per token.
print(attention_mask)

[1, 1, 1, 1, 1, 0, 0]


### As we can observe, the attention mask is 0 at the positions where we have a [PAD] token and 1 at all other positions. Next, we convert all the tokens to their token_ids as shown below: 


In [13]:
# Look up the vocabulary id of every token, special tokens included.
token_ids = tokenizer.convert_tokens_to_ids(tokens)


### Let's have a look at the token_ids:

In [14]:
# Show the integer ids, one per token.
print(token_ids)

[101, 1045, 2572, 2204, 102, 0, 0]



### From the above output, we can observe that each token is mapped to its token id (both [PAD] tokens share the id 0). Now, we will convert the token_ids and attention_mask to tensors as shown below:

In [15]:
# Convert the Python lists to tensors with a leading batch dimension of 1.
# torch.as_tensor(...).reshape(1, -1) replaces torch.tensor(...).clone().detach().unsqueeze(0):
# the extra clone/detach was redundant, and this form is idempotent. Re-running
# the cell on an already converted tensor keeps the (1, sequence_length) shape
# instead of adding another dimension.
token_ids = torch.as_tensor(token_ids).reshape(1, -1)
attention_mask = torch.as_tensor(attention_mask).reshape(1, -1)

### That's it. Next, we feed the token_ids and attention_mask to the pre-trained BERT model and get the embedding --- i.e., the output of encoder 12. 

In [16]:
# Forward pass for inference only: disable gradient tracking so no autograd
# graph is built for the returned embeddings.
# output_hidden_states=True additionally requests the hidden states of every layer.
with torch.no_grad():
    output = model(token_ids, attention_mask=attention_mask, output_hidden_states=True)

### The output contains the tensor output[0], which holds the embeddings of all tokens (words and special tokens) of our sentence. 

In [17]:
# Show the shape of output[0], the tensor holding one embedding per token position.
print(output[0].shape)

torch.Size([1, 7, 768])


### The size [1,7,768] indicates the [batch_size, sequence_length, hidden_size].

### Our batch size is 1. The sequence length is the number of tokens; since we have 7 tokens, the sequence length is 7. The hidden size is the representation (embedding) size, which is 768 for the BERT-base model. 

### We can obtain the representation of each token as: 

### - output[0][0][0] gives the representation of the first token which is [CLS]
### - output[0][0][1] gives the representation of the second token which is 'i' 
### - output[0][0][2] gives the representation of the third token which is 'am' 



In [18]:
# Embedding of the token at position 1 (the token right after [CLS]) in the single batch entry.
print(output[0][0][1])

tensor([-7.4807e-03, -1.8489e-01,  1.1664e-01, -5.3192e-01, -1.4383e-02,
         5.5371e-01,  4.5283e-02,  1.1347e+00, -3.9315e-02, -3.0881e-01,
        -3.5426e-01, -2.9035e-01,  5.3685e-01,  3.3270e-01,  1.5715e-01,
         1.1401e-01,  4.6529e-01,  1.3255e-01,  6.5985e-02,  8.2078e-01,
        -1.2184e-01, -5.4947e-01, -9.0832e-01,  4.4330e-01,  5.8226e-01,
         2.5752e-01, -3.0127e-01, -1.8286e-01, -1.2086e-01, -6.0585e-02,
         2.5353e-01, -8.8148e-02, -3.7724e-01,  7.7164e-01, -3.8769e-01,
        -5.8781e-01,  6.5402e-03, -3.4348e-01, -4.7344e-01,  7.3665e-01,
        -1.8006e-01, -3.4319e-01,  2.9422e-01, -1.5370e-02, -2.6498e-01,
        -4.8240e-01,  4.9075e-01, -2.4224e-01, -5.1813e-02, -7.4864e-01,
        -2.6666e-01,  3.4314e-02, -2.1519e-01,  2.9984e-01, -2.9019e-01,
         8.3109e-01,  1.8701e-01, -6.6528e-01, -2.9929e-01,  2.1505e-01,
        -6.4004e-02, -1.6274e-01,  8.3385e-01, -3.5514e-01, -5.9174e-01,
         8.8238e-01, -2.6925e-01, -5.3357e-03, -1.0