In [19]:
from transformers import BertModel, BertTokenizer
import torch

In [3]:
# One checkpoint name shared by the weights and the vocabulary, so the two
# cannot drift apart when the notebook is pointed at another model.
MODEL_NAME = 'bert-base-uncased'

# Bare encoder without a task head: the warning printed below lists the
# `cls.*` pre-training head weights from the checkpoint that BertModel does not use.
model = BertModel.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# The embedding block combines three lookup tables (see the repr below):
#   word_embeddings       -> context-free word embeddings
#   position_embeddings   -> encodes word position
#   token_type_embeddings -> indexed by 0 or 1 to look up the segment embedding
# Their output then goes through LayerNorm and dropout.
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [12]:
# Turn one sentence into a batch of vocabulary ids, returned as a PyTorch
# tensor ('pt'). The output below has 6 ids for these 3 words: presumably the
# leading 101 / trailing 102 are the special [CLS] / [SEP] tokens and the last
# word is split into two word pieces -- TODO confirm against the vocabulary.
example_phrase = 'I am Sunggon'
tokens = tokenizer.encode(text=example_phrase, return_tensors='pt')
tokens

tensor([[ 101, 1045, 2572, 7042, 7446,  102]])

In [11]:
# Look up one context-free vector per token id; the result gains a trailing
# hidden dimension (768 in the output below).
model.embeddings.word_embeddings(tokens).size()

torch.Size([1, 6, 768])

In [13]:
# Same lookup for a second phrase: the middle dimension follows the number of
# ids the tokenizer produced (5 here, per the output below).
park_tokens = tokenizer.encode('I am Park', return_tensors='pt')
model.embeddings.word_embeddings(park_tokens).shape

torch.Size([1, 5, 768])

In [14]:
# Position lookup table: the repr below shows 512 rows (one per position index)
# of width 768, the same width as the word embeddings.
model.embeddings.position_embeddings

Embedding(512, 768)

In [23]:
# Position indices 0 .. seq_len-1. The length is read from `tokens` so this
# stays correct if the example phrase (and therefore its token count) changes.
torch.arange(tokens.shape[1])

tensor([0, 1, 2, 3, 4, 5])

In [24]:
# One position vector per token position. The index range is sized from
# `tokens` rather than a hard-coded 6, so it always matches the encoded phrase.
model.embeddings.position_embeddings(torch.arange(tokens.shape[1])).shape

torch.Size([6, 768])

In [28]:
# Actual position vectors for positions 0 .. seq_len-1 (every row differs,
# unlike the token-type vectors). Length is derived from `tokens`.
model.embeddings.position_embeddings(torch.arange(tokens.shape[1]))

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        [-4.1949e-03, -1.1852e-02, -2.1180e-02,  ...,  2.2455e-02,
          5.2826e-03, -1.9723e-03],
        [-5.6087e-03, -1.0445e-02, -7.2288e-03,  ...,  2.0837e-02,
          3.5402e-03,  4.7708e-03],
        [-3.0871e-03, -1.8956e-02, -1.8930e-02,  ...,  7.4045e-03,
          2.0183e-02,  3.4077e-03]], grad_fn=<EmbeddingBackward0>)

In [25]:
# Segment lookup table: the repr below shows only 2 rows (token type 0 or 1),
# each of width 768.
model.embeddings.token_type_embeddings

Embedding(2, 768)

In [29]:
# Segment id 0 for every token of the single-sentence input; the count follows
# the encoded phrase instead of a hard-coded 6.
[0] * tokens.shape[1]

[0, 0, 0, 0, 0, 0]

In [26]:
# The same all-zero segment ids as an int64 tensor, which is the index type an
# embedding lookup expects. Length is derived from `tokens`.
torch.zeros(tokens.shape[1], dtype=torch.long)

tensor([0, 0, 0, 0, 0, 0])

In [30]:
# One segment vector per token; the index tensor is sized from `tokens`
# rather than a hard-coded 6.
model.embeddings.token_type_embeddings(torch.zeros(tokens.shape[1], dtype=torch.long)).shape

torch.Size([6, 768])

In [31]:
# Every token looks up row 0 of the segment table, so all rows of the result
# are identical (see the output below). Length is derived from `tokens`.
model.embeddings.token_type_embeddings(torch.zeros(tokens.shape[1], dtype=torch.long))

tensor([[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],
       grad_fn=<EmbeddingBackward0>)

In [37]:
# Rebuild the embedding block's forward pass by hand:
#   dropout(LayerNorm(word + position + token_type))
# so it can be compared with calling model.embeddings(tokens) directly.
bert_embeddings = model.embeddings

# Sequence length comes from the input instead of a hard-coded 6, so the cell
# keeps working when the example phrase changes.
seq_len = tokens.shape[1]
position_ids = torch.arange(seq_len)                      # 0 .. seq_len-1
token_type_ids = torch.zeros(seq_len, dtype=torch.long)   # single sentence -> segment 0

# The (seq_len, hidden) position / segment vectors broadcast over the batch
# dimension of the (batch, seq_len, hidden) word vectors.
summed = (
    bert_embeddings.word_embeddings(tokens)
    + bert_embeddings.position_embeddings(position_ids)
    + bert_embeddings.token_type_embeddings(token_type_ids)
)

# NOTE: dropout is only the identity in eval mode. from_pretrained() is
# documented to return the model in eval mode; if model.train() has been
# called, this result becomes random and will not match model.embeddings(tokens).
embedded_1 = bert_embeddings.dropout(bert_embeddings.LayerNorm(summed))
embedded_1

tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [-3.4025e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
           8.9008e-01,  1.6575e-01],
         [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
           1.3468e+00, -6.9357e-01],
         [-1.3851e+00, -1.7490e-01, -1.8621e-03,  ..., -1.9385e-01,
          -1.2197e-01,  4.6625e-02],
         [ 7.4871e-01, -3.5681e-01,  6.5539e-02,  ...,  6.4446e-01,
          -2.2006e-01,  5.4353e-01],
         [-3.2507e-01, -3.1879e-01, -1.1632e-01,  ..., -3.9602e-01,
           4.1120e-01, -7.7552e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [38]:
# Reference result: let the embedding module run its own forward pass.
# Only the token ids are passed, so the module supplies position and
# token-type ids itself.
embedded_2 = model.embeddings(input_ids=tokens)
embedded_2

tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [-3.4026e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
           8.9008e-01,  1.6575e-01],
         [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
           1.3468e+00, -6.9357e-01],
         [-1.3851e+00, -1.7490e-01, -1.8622e-03,  ..., -1.9385e-01,
          -1.2197e-01,  4.6625e-02],
         [ 7.4871e-01, -3.5681e-01,  6.5539e-02,  ...,  6.4446e-01,
          -2.2006e-01,  5.4353e-01],
         [-3.2507e-01, -3.1879e-01, -1.1632e-01,  ..., -3.9602e-01,
           4.1120e-01, -7.7552e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [40]:
# Equal shapes alone do not show that the hand-built pipeline reproduces the
# module, so compare the values too. A small absolute tolerance absorbs the
# last-digit float differences visible in the two printed tensors.
assert torch.allclose(embedded_1, embedded_2, atol=1e-6), (
    'manual embedding pipeline does not match model.embeddings(tokens)'
)
embedded_1.shape, embedded_2.shape

(torch.Size([1, 6, 768]), torch.Size([1, 6, 768]))